c608a660c22a0eaf8f9a8b196fc7b43bcb10eea1
[alexxy/gromacs.git] / src / gromacs / nbnxm / kerneldispatch.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6  * and including many others, as listed in the AUTHORS file in the
7  * top-level source directory and at http://www.gromacs.org.
8  *
9  * GROMACS is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public License
11  * as published by the Free Software Foundation; either version 2.1
12  * of the License, or (at your option) any later version.
13  *
14  * GROMACS is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with GROMACS; if not, see
21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
23  *
24  * If you want to redistribute modifications to GROMACS, please
25  * consider that scientific software is very special. Version
26  * control is crucial - bugs must be traceable. We will be happy to
27  * consider code for inclusion in the official distribution, but
28  * derived work must not be called official GROMACS. Details are found
29  * in the README & COPYING files - if they are missing, get the
30  * official version at http://www.gromacs.org.
31  *
32  * To help us fund GROMACS development, we humbly ask that you cite
33  * the research papers on the package. Check out http://www.gromacs.org.
34  */
35
36 #include "gmxpre.h"
37
38 #include "gromacs/gmxlib/nrnb.h"
39 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
40 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
41 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
42 #include "gromacs/math/vectypes.h"
43 #include "gromacs/mdlib/enerdata_utils.h"
44 #include "gromacs/mdlib/force.h"
45 #include "gromacs/mdlib/force_flags.h"
46 #include "gromacs/mdlib/gmx_omp_nthreads.h"
47 #include "gromacs/mdtypes/enerdata.h"
48 #include "gromacs/mdtypes/forceoutput.h"
49 #include "gromacs/mdtypes/inputrec.h"
50 #include "gromacs/mdtypes/interaction_const.h"
51 #include "gromacs/mdtypes/md_enums.h"
52 #include "gromacs/mdtypes/mdatom.h"
53 #include "gromacs/nbnxm/gpu_data_mgmt.h"
54 #include "gromacs/nbnxm/nbnxm.h"
55 #include "gromacs/nbnxm/nbnxm_simd.h"
56 #include "gromacs/nbnxm/kernels_reference/kernel_gpu_ref.h"
57 #include "gromacs/simd/simd.h"
58 #include "gromacs/timing/wallcycle.h"
59 #include "gromacs/utility/gmxassert.h"
60 #include "gromacs/utility/real.h"
61
62 #include "kernel_common.h"
63 #include "pairlistset.h"
64 #include "pairlistsets.h"
65 #define INCLUDE_KERNELFUNCTION_TABLES
66 #include "gromacs/nbnxm/kernels_reference/kernel_ref.h"
67 #ifdef GMX_NBNXN_SIMD_2XNN
68 #include "gromacs/nbnxm/kernels_simd_2xmm/kernels.h"
69 #endif
70 #ifdef GMX_NBNXN_SIMD_4XN
71 #include "gromacs/nbnxm/kernels_simd_4xm/kernels.h"
72 #endif
73 #undef INCLUDE_FUNCTION_TABLES
74
75 /*! \brief Clears the energy group output buffers
76  *
77  * \param[in,out] out  nbnxn kernel output struct
78  */
79 static void clearGroupEnergies(nbnxn_atomdata_output_t *out)
80 {
81     std::fill(out->Vvdw.begin(), out->Vvdw.end(), 0.0_real);
82     std::fill(out->Vc.begin(), out->Vc.end(), 0.0_real);
83     std::fill(out->VSvdw.begin(), out->VSvdw.end(), 0.0_real);
84     std::fill(out->VSc.begin(), out->VSc.end(), 0.0_real);
85 }
86
87 /*! \brief Reduce the group-pair energy buffers produced by a SIMD kernel
88  * to single terms in the output buffers.
89  *
90  * The SIMD kernels produce a large number of energy buffer in SIMD registers
91  * to avoid scattered reads and writes.
92  *
93  * \tparam        unrollj         The unroll size for j-particles in the SIMD kernel
94  * \param[in]     numGroups       The number of energy groups
95  * \param[in]     numGroups_2log  Log2 of numGroups, rounded up
96  * \param[in,out] out             Struct with energy buffers
97  */
98 template <int unrollj> static void
99 reduceGroupEnergySimdBuffers(int                       numGroups,
100                              int                       numGroups_2log,
101                              nbnxn_atomdata_output_t  *out)
102 {
103     const int                 unrollj_half     = unrollj/2;
104     /* Energies are stored in SIMD registers with size 2^numGroups_2log */
105     const int                 numGroupsStorage = (1 << numGroups_2log);
106
107     const real * gmx_restrict vVdwSimd     = out->VSvdw.data();
108     const real * gmx_restrict vCoulombSimd = out->VSc.data();
109     real * gmx_restrict       vVdw         = out->Vvdw.data();
110     real * gmx_restrict       vCoulomb     = out->Vc.data();
111
112     /* The size of the SIMD energy group buffer array is:
113      * numGroups*numGroups*numGroupsStorage*unrollj_half*simd_width
114      */
115     for (int i = 0; i < numGroups; i++)
116     {
117         for (int j1 = 0; j1 < numGroups; j1++)
118         {
119             for (int j0 = 0; j0 < numGroups; j0++)
120             {
121                 int c = ((i*numGroups + j1)*numGroupsStorage + j0)*unrollj_half*unrollj;
122                 for (int s = 0; s < unrollj_half; s++)
123                 {
124                     vVdw    [i*numGroups + j0] += vVdwSimd    [c + 0];
125                     vVdw    [i*numGroups + j1] += vVdwSimd    [c + 1];
126                     vCoulomb[i*numGroups + j0] += vCoulombSimd[c + 0];
127                     vCoulomb[i*numGroups + j1] += vCoulombSimd[c + 1];
128                     c                          += unrollj + 2;
129                 }
130             }
131         }
132     }
133 }
134
135 /*! \brief Dispatches the non-bonded N versus M atom cluster CPU kernels.
136  *
137  * OpenMP parallelization is performed within this function.
138  * Energy reduction, but not force and shift force reduction, is performed
139  * within this function.
140  *
141  * \param[in]     pairlistSet   Pairlists with local or non-local interactions to compute
142  * \param[in]     kernelSetup   The non-bonded kernel setup
143  * \param[in,out] nbat          The atomdata for the interactions
144  * \param[in]     ic            Non-bonded interaction constants
145  * \param[in]     shiftVectors  The PBC shift vectors
146  * \param[in]     forceFlags    Flags that tell what to compute
147  * \param[in]     clearF        Enum that tells if to clear the force output buffer
148  * \param[out]    vCoulomb      Output buffer for Coulomb energies
149  * \param[out]    vVdw          Output buffer for Van der Waals energies
150  * \param[in]     wcycle        Pointer to cycle counting data structure.
151  */
152 static void
153 nbnxn_kernel_cpu(const PairlistSet              &pairlistSet,
154                  const Nbnxm::KernelSetup       &kernelSetup,
155                  nbnxn_atomdata_t               *nbat,
156                  const interaction_const_t      &ic,
157                  rvec                           *shiftVectors,
158                  int                             forceFlags,
159                  int                             clearF,
160                  real                           *vCoulomb,
161                  real                           *vVdw,
162                  gmx_wallcycle                  *wcycle)
163 {
164
165     int                      coulkt;
166     if (EEL_RF(ic.eeltype) || ic.eeltype == eelCUT)
167     {
168         coulkt = coulktRF;
169     }
170     else
171     {
172         if (kernelSetup.ewaldExclusionType == Nbnxm::EwaldExclusionType::Table)
173         {
174             if (ic.rcoulomb == ic.rvdw)
175             {
176                 coulkt = coulktTAB;
177             }
178             else
179             {
180                 coulkt = coulktTAB_TWIN;
181             }
182         }
183         else
184         {
185             if (ic.rcoulomb == ic.rvdw)
186             {
187                 coulkt = coulktEWALD;
188             }
189             else
190             {
191                 coulkt = coulktEWALD_TWIN;
192             }
193         }
194     }
195
196     const nbnxn_atomdata_t::Params &nbatParams = nbat->params();
197
198     int vdwkt = 0;
199     if (ic.vdwtype == evdwCUT)
200     {
201         switch (ic.vdw_modifier)
202         {
203             case eintmodNONE:
204             case eintmodPOTSHIFT:
205                 switch (nbatParams.comb_rule)
206                 {
207                     case ljcrGEOM: vdwkt = vdwktLJCUT_COMBGEOM; break;
208                     case ljcrLB:   vdwkt = vdwktLJCUT_COMBLB;   break;
209                     case ljcrNONE: vdwkt = vdwktLJCUT_COMBNONE; break;
210                     default:
211                         GMX_RELEASE_ASSERT(false, "Unknown combination rule");
212                 }
213                 break;
214             case eintmodFORCESWITCH:
215                 vdwkt = vdwktLJFORCESWITCH;
216                 break;
217             case eintmodPOTSWITCH:
218                 vdwkt = vdwktLJPOTSWITCH;
219                 break;
220             default:
221                 GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction modifier");
222         }
223     }
224     else if (ic.vdwtype == evdwPME)
225     {
226         if (ic.ljpme_comb_rule == eljpmeGEOM)
227         {
228             vdwkt = vdwktLJEWALDCOMBGEOM;
229         }
230         else
231         {
232             vdwkt = vdwktLJEWALDCOMBLB;
233             /* At setup we (should have) selected the C reference kernel */
234             GMX_RELEASE_ASSERT(kernelSetup.kernelType == Nbnxm::KernelType::Cpu4x4_PlainC, "Only the C reference nbnxn SIMD kernel supports LJ-PME with LB combination rules");
235         }
236     }
237     else
238     {
239         GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction type");
240     }
241
242     gmx::ArrayRef<const NbnxnPairlistCpu> pairlists = pairlistSet.cpuLists();
243
244     int gmx_unused                        nthreads = gmx_omp_nthreads_get(emntNonbonded);
245     wallcycle_sub_start(wcycle, ewcsNONBONDED_CLEAR);
246 #pragma omp parallel for schedule(static) num_threads(nthreads)
247     for (gmx::index nb = 0; nb < pairlists.ssize(); nb++)
248     {
249         // Presently, the kernels do not call C++ code that can throw,
250         // so no need for a try/catch pair in this OpenMP region.
251         nbnxn_atomdata_output_t *out = &nbat->out[nb];
252
253         if (clearF == enbvClearFYes)
254         {
255             clearForceBuffer(nbat, nb);
256
257             clear_fshift(out->fshift.data());
258         }
259
260         if (nb == 0)
261         {
262             wallcycle_sub_stop(wcycle, ewcsNONBONDED_CLEAR);
263             wallcycle_sub_start(wcycle, ewcsNONBONDED_KERNEL);
264         }
265
266         // TODO: Change to reference
267         const NbnxnPairlistCpu *pairlist = &pairlists[nb];
268
269         if (!(forceFlags & GMX_FORCE_ENERGY))
270         {
271             /* Don't calculate energies */
272             switch (kernelSetup.kernelType)
273             {
274                 case Nbnxm::KernelType::Cpu4x4_PlainC:
275                     nbnxn_kernel_noener_ref[coulkt][vdwkt](pairlist, nbat,
276                                                            &ic,
277                                                            shiftVectors,
278                                                            out);
279                     break;
280 #ifdef GMX_NBNXN_SIMD_2XNN
281                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
282                     nbnxm_kernel_noener_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
283                                                                  &ic,
284                                                                  shiftVectors,
285                                                                  out);
286                     break;
287 #endif
288 #ifdef GMX_NBNXN_SIMD_4XN
289                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
290                     nbnxm_kernel_noener_simd_4xm[coulkt][vdwkt](pairlist, nbat,
291                                                                 &ic,
292                                                                 shiftVectors,
293                                                                 out);
294                     break;
295 #endif
296                 default:
297                     GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
298             }
299         }
300         else if (out->Vvdw.size() == 1)
301         {
302             /* A single energy group (pair) */
303             out->Vvdw[0] = 0;
304             out->Vc[0]   = 0;
305
306             switch (kernelSetup.kernelType)
307             {
308                 case Nbnxm::KernelType::Cpu4x4_PlainC:
309                     nbnxn_kernel_ener_ref[coulkt][vdwkt](pairlist, nbat,
310                                                          &ic,
311                                                          shiftVectors,
312                                                          out);
313                     break;
314 #ifdef GMX_NBNXN_SIMD_2XNN
315                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
316                     nbnxm_kernel_ener_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
317                                                                &ic,
318                                                                shiftVectors,
319                                                                out);
320                     break;
321 #endif
322 #ifdef GMX_NBNXN_SIMD_4XN
323                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
324                     nbnxm_kernel_ener_simd_4xm[coulkt][vdwkt](pairlist, nbat,
325                                                               &ic,
326                                                               shiftVectors,
327                                                               out);
328                     break;
329 #endif
330                 default:
331                     GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
332             }
333         }
334         else
335         {
336             /* Calculate energy group contributions */
337             clearGroupEnergies(out);
338
339             int unrollj = 0;
340
341             switch (kernelSetup.kernelType)
342             {
343                 case Nbnxm::KernelType::Cpu4x4_PlainC:
344                     unrollj = c_nbnxnCpuIClusterSize;
345                     nbnxn_kernel_energrp_ref[coulkt][vdwkt](pairlist, nbat,
346                                                             &ic,
347                                                             shiftVectors,
348                                                             out);
349                     break;
350 #ifdef GMX_NBNXN_SIMD_2XNN
351                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
352                     unrollj = GMX_SIMD_REAL_WIDTH/2;
353                     nbnxm_kernel_energrp_simd_2xmm[coulkt][vdwkt](pairlist, nbat,
354                                                                   &ic,
355                                                                   shiftVectors,
356                                                                   out);
357                     break;
358 #endif
359 #ifdef GMX_NBNXN_SIMD_4XN
360                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
361                     unrollj = GMX_SIMD_REAL_WIDTH;
362                     nbnxm_kernel_energrp_simd_4xm[coulkt][vdwkt](pairlist, nbat,
363                                                                  &ic,
364                                                                  shiftVectors,
365                                                                  out);
366                     break;
367 #endif
368                 default:
369                     GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
370             }
371
372             if (kernelSetup.kernelType != Nbnxm::KernelType::Cpu4x4_PlainC)
373             {
374                 switch (unrollj)
375                 {
376                     case 2:
377                         reduceGroupEnergySimdBuffers<2>(nbatParams.nenergrp,
378                                                         nbatParams.neg_2log,
379                                                         out);
380                         break;
381                     case 4:
382                         reduceGroupEnergySimdBuffers<4>(nbatParams.nenergrp,
383                                                         nbatParams.neg_2log,
384                                                         out);
385                         break;
386                     case 8:
387                         reduceGroupEnergySimdBuffers<8>(nbatParams.nenergrp,
388                                                         nbatParams.neg_2log,
389                                                         out);
390                         break;
391                     default:
392                         GMX_RELEASE_ASSERT(false, "Unsupported j-unroll size");
393                 }
394             }
395         }
396     }
397     wallcycle_sub_stop(wcycle, ewcsNONBONDED_KERNEL);
398
399     if (forceFlags & GMX_FORCE_ENERGY)
400     {
401         reduce_energies_over_lists(nbat, pairlists.ssize(), vVdw, vCoulomb);
402     }
403 }
404
405 static void accountFlops(t_nrnb                           *nrnb,
406                          const PairlistSet                &pairlistSet,
407                          const nonbonded_verlet_t         &nbv,
408                          const interaction_const_t        &ic,
409                          const int                         forceFlags)
410 {
411     const bool usingGpuKernels = nbv.useGpu();
412
413     int        enr_nbnxn_kernel_ljc;
414     if (EEL_RF(ic.eeltype) || ic.eeltype == eelCUT)
415     {
416         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
417     }
418     else if ((!usingGpuKernels && nbv.kernelSetup().ewaldExclusionType == Nbnxm::EwaldExclusionType::Analytical) ||
419              (usingGpuKernels && Nbnxm::gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
420     {
421         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
422     }
423     else
424     {
425         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
426     }
427     int enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
428     if (forceFlags & GMX_FORCE_ENERGY)
429     {
430         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
431         enr_nbnxn_kernel_ljc += 1;
432         enr_nbnxn_kernel_lj  += 1;
433     }
434
435     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc,
436              pairlistSet.natpair_ljq_);
437     inc_nrnb(nrnb, enr_nbnxn_kernel_lj,
438              pairlistSet.natpair_lj_);
439     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
440     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc-eNR_NBNXN_LJ_RF+eNR_NBNXN_RF,
441              pairlistSet.natpair_q_);
442
443     const bool calcEnergy = ((forceFlags & GMX_FORCE_ENERGY) != 0);
444     if (ic.vdw_modifier == eintmodFORCESWITCH)
445     {
446         /* We add up the switch cost separately */
447         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW + (calcEnergy ? 1 : 0),
448                  pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
449     }
450     if (ic.vdw_modifier == eintmodPOTSWITCH)
451     {
452         /* We add up the switch cost separately */
453         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW + (calcEnergy ? 1 : 0),
454                  pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
455     }
456     if (ic.vdwtype == evdwPME)
457     {
458         /* We add up the LJ Ewald cost separately */
459         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD + (calcEnergy ? 1 : 0),
460                  pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
461     }
462 }
463
464 void
465 nonbonded_verlet_t::dispatchNonbondedKernel(Nbnxm::InteractionLocality iLocality,
466                                             const interaction_const_t &ic,
467                                             int                        forceFlags,
468                                             int                        clearF,
469                                             const t_forcerec          &fr,
470                                             gmx_enerdata_t            *enerd,
471                                             t_nrnb                    *nrnb)
472 {
473     const PairlistSet &pairlistSet = pairlistSets().pairlistSet(iLocality);
474
475     switch (kernelSetup().kernelType)
476     {
477         case Nbnxm::KernelType::Cpu4x4_PlainC:
478         case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
479         case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
480             nbnxn_kernel_cpu(pairlistSet,
481                              kernelSetup(),
482                              nbat.get(),
483                              ic,
484                              fr.shift_vec,
485                              forceFlags,
486                              clearF,
487                              enerd->grpp.ener[egCOULSR].data(),
488                              fr.bBHAM ?
489                              enerd->grpp.ener[egBHAMSR].data() :
490                              enerd->grpp.ener[egLJSR].data(),
491                              wcycle_);
492             break;
493
494         case Nbnxm::KernelType::Gpu8x8x8:
495             Nbnxm::gpu_launch_kernel(gpu_nbv, forceFlags, iLocality);
496             break;
497
498         case Nbnxm::KernelType::Cpu8x8x8_PlainC:
499             nbnxn_kernel_gpu_ref(pairlistSet.gpuList(),
500                                  nbat.get(), &ic,
501                                  fr.shift_vec,
502                                  forceFlags,
503                                  clearF,
504                                  nbat->out[0].f,
505                                  nbat->out[0].fshift.data(),
506                                  enerd->grpp.ener[egCOULSR].data(),
507                                  fr.bBHAM ?
508                                  enerd->grpp.ener[egBHAMSR].data() :
509                                  enerd->grpp.ener[egLJSR].data());
510             break;
511
512         default:
513             GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
514
515     }
516
517     accountFlops(nrnb, pairlistSet, *this, ic, forceFlags);
518 }
519
520 void
521 nonbonded_verlet_t::dispatchFreeEnergyKernel(Nbnxm::InteractionLocality  iLocality,
522                                              const t_forcerec           *fr,
523                                              rvec                        x[],
524                                              gmx::ForceWithShiftForces  *forceWithShiftForces,
525                                              const t_mdatoms            &mdatoms,
526                                              t_lambda                   *fepvals,
527                                              real                       *lambda,
528                                              gmx_enerdata_t             *enerd,
529                                              const int                   forceFlags,
530                                              t_nrnb                     *nrnb)
531 {
532     const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists();
533
534     /* When the first list is empty, all are empty and there is nothing to do */
535     if (!pairlistSets().params().haveFep || nbl_fep[0]->nrj == 0)
536     {
537         return;
538     }
539
540     int donb_flags = 0;
541     /* Add short-range interactions */
542     donb_flags |= GMX_NONBONDED_DO_SR;
543
544     /* Currently all group scheme kernels always calculate (shift-)forces */
545     if (forceFlags & GMX_FORCE_FORCES)
546     {
547         donb_flags |= GMX_NONBONDED_DO_FORCE;
548     }
549     if (forceFlags & GMX_FORCE_VIRIAL)
550     {
551         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
552     }
553     if (forceFlags & GMX_FORCE_ENERGY)
554     {
555         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
556     }
557
558     nb_kernel_data_t kernel_data;
559     real             dvdl_nb[efptNR] = { 0 };
560     kernel_data.flags  = donb_flags;
561     kernel_data.lambda = lambda;
562     kernel_data.dvdl   = dvdl_nb;
563
564     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR].data();
565     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR].data();
566
567     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(), "Number of lists should be same as number of NB threads");
568
569     wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP);
570 #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
571     for (gmx::index th = 0; th < nbl_fep.ssize(); th++)
572     {
573         try
574         {
575             gmx_nb_free_energy_kernel(nbl_fep[th].get(),
576                                       x, forceWithShiftForces,
577                                       fr, &mdatoms, &kernel_data, nrnb);
578         }
579         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
580     }
581
582     if (fepvals->sc_alpha != 0)
583     {
584         enerd->dvdl_nonlin[efptVDW]  += dvdl_nb[efptVDW];
585         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
586     }
587     else
588     {
589         enerd->dvdl_lin[efptVDW]  += dvdl_nb[efptVDW];
590         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
591     }
592
593     /* If we do foreign lambda and we have soft-core interactions
594      * we have to recalculate the (non-linear) energies contributions.
595      */
596     if (fepvals->n_lambda > 0 && (forceFlags & GMX_FORCE_DHDL) && fepvals->sc_alpha != 0)
597     {
598         real lam_i[efptNR];
599         kernel_data.flags          = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) | GMX_NONBONDED_DO_FOREIGNLAMBDA;
600         kernel_data.lambda         = lam_i;
601         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR].data();
602         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR].data();
603         /* Note that we add to kernel_data.dvdl, but ignore the result */
604
605         for (size_t i = 0; i < enerd->enerpart_lambda.size(); i++)
606         {
607             for (int j = 0; j < efptNR; j++)
608             {
609                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i-1]);
610             }
611             reset_foreign_enerdata(enerd);
612 #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
613             for (gmx::index th = 0; th < nbl_fep.ssize(); th++)
614             {
615                 try
616                 {
617                     gmx_nb_free_energy_kernel(nbl_fep[th].get(),
618                                               x, forceWithShiftForces,
619                                               fr, &mdatoms, &kernel_data, nrnb);
620                 }
621                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
622             }
623
624             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
625             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
626         }
627     }
628     wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP);
629 }