[alexxy/gromacs.git] / src / gromacs / nbnxm / kerneldispatch.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
5  * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
6  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
7  * and including many others, as listed in the AUTHORS file in the
8  * top-level source directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36
37 #include "gmxpre.h"
38
39 #include "gromacs/gmxlib/nrnb.h"
40 #include "gromacs/gmxlib/nonbonded/nb_free_energy.h"
41 #include "gromacs/gmxlib/nonbonded/nb_kernel.h"
42 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
43 #include "gromacs/math/vectypes.h"
44 #include "gromacs/mdlib/enerdata_utils.h"
45 #include "gromacs/mdlib/force.h"
46 #include "gromacs/mdlib/gmx_omp_nthreads.h"
47 #include "gromacs/mdtypes/enerdata.h"
48 #include "gromacs/mdtypes/forceoutput.h"
49 #include "gromacs/mdtypes/forcerec.h"
50 #include "gromacs/mdtypes/inputrec.h"
51 #include "gromacs/mdtypes/interaction_const.h"
52 #include "gromacs/mdtypes/md_enums.h"
53 #include "gromacs/mdtypes/mdatom.h"
54 #include "gromacs/mdtypes/simulation_workload.h"
55 #include "gromacs/nbnxm/gpu_data_mgmt.h"
56 #include "gromacs/nbnxm/nbnxm.h"
57 #include "gromacs/simd/simd.h"
58 #include "gromacs/timing/wallcycle.h"
59 #include "gromacs/utility/gmxassert.h"
60 #include "gromacs/utility/real.h"
61
62 #include "kernel_common.h"
63 #include "nbnxm_gpu.h"
64 #include "nbnxm_simd.h"
65 #include "pairlistset.h"
66 #include "pairlistsets.h"
67 #include "kernels_reference/kernel_gpu_ref.h"
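// Note (explanatory, not from the original file): defining
// INCLUDE_KERNELFUNCTION_TABLES before including the kernel headers below is
// intended to make those headers define, rather than only declare, the
// coulkt x vdwkt kernel function-pointer tables (e.g. nbnxn_kernel_noener_ref)
// that the dispatch code in this file indexes into.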
68 #define INCLUDE_KERNELFUNCTION_TABLES
69 #include "kernels_reference/kernel_ref.h"
70 #ifdef GMX_NBNXN_SIMD_2XNN
71 #    include "kernels_simd_2xmm/kernels.h"
72 #endif
73 #ifdef GMX_NBNXN_SIMD_4XN
74 #    include "kernels_simd_4xm/kernels.h"
75 #endif
76 #undef INCLUDE_KERNELFUNCTION_TABLES
77
78 /*! \brief Clears the energy group output buffers
79  *
80  * \param[in,out] out  nbnxn kernel output struct
81  */
82 static void clearGroupEnergies(nbnxn_atomdata_output_t* out)
83 {
84     std::fill(out->Vvdw.begin(), out->Vvdw.end(), 0.0_real);
85     std::fill(out->Vc.begin(), out->Vc.end(), 0.0_real);
86     std::fill(out->VSvdw.begin(), out->VSvdw.end(), 0.0_real);
87     std::fill(out->VSc.begin(), out->VSc.end(), 0.0_real);
88 }
89
90 /*! \brief Reduce the group-pair energy buffers produced by a SIMD kernel
91  * to single terms in the output buffers.
92  *
93  * The SIMD kernels produce a large number of energy buffers in SIMD registers
94  * to avoid scattered reads and writes.
95  *
96  * \tparam        unrollj         The unroll size for j-particles in the SIMD kernel
97  * \param[in]     numGroups       The number of energy groups
98  * \param[in]     numGroups_2log  Log2 of numGroups, rounded up
99  * \param[in,out] out             Struct with energy buffers
100  */
101 template<int unrollj>
102 static void reduceGroupEnergySimdBuffers(int numGroups, int numGroups_2log, nbnxn_atomdata_output_t* out)
103 {
104     const int unrollj_half = unrollj / 2;
105     /* Energies are stored in SIMD registers with size 2^numGroups_2log */
106     const int numGroupsStorage = (1 << numGroups_2log);
107
108     const real* gmx_restrict vVdwSimd     = out->VSvdw.data();
109     const real* gmx_restrict vCoulombSimd = out->VSc.data();
110     real* gmx_restrict vVdw               = out->Vvdw.data();
111     real* gmx_restrict vCoulomb           = out->Vc.data();
112
113     /* The size of the SIMD energy group buffer array is:
114      * numGroups*numGroups*numGroupsStorage*unrollj_half*simd_width
115      */
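    /* Reading the loop below: each (j1, j0) pair of j-atom energy groups owns
     * a block of unrollj_half*unrollj reals starting at c. Slot s of a block
     * reads the two values at offsets s*(unrollj + 2) and s*(unrollj + 2) + 1;
     * the first is accumulated into the (i, j0) output term and the second
     * into the (i, j1) term.
     */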
116     for (int i = 0; i < numGroups; i++)
117     {
118         for (int j1 = 0; j1 < numGroups; j1++)
119         {
120             for (int j0 = 0; j0 < numGroups; j0++)
121             {
122                 int c = ((i * numGroups + j1) * numGroupsStorage + j0) * unrollj_half * unrollj;
123                 for (int s = 0; s < unrollj_half; s++)
124                 {
125                     vVdw[i * numGroups + j0] += vVdwSimd[c + 0];
126                     vVdw[i * numGroups + j1] += vVdwSimd[c + 1];
127                     vCoulomb[i * numGroups + j0] += vCoulombSimd[c + 0];
128                     vCoulomb[i * numGroups + j1] += vCoulombSimd[c + 1];
129                     c += unrollj + 2;
130                 }
131             }
132         }
133     }
134 }
135
136 /*! \brief Dispatches the non-bonded N versus M atom cluster CPU kernels.
137  *
138  * OpenMP parallelization is performed within this function.
139  * Energy reduction, but not force and shift force reduction, is performed
140  * within this function.
141  *
142  * \param[in]     pairlistSet   Pairlists with local or non-local interactions to compute
143  * \param[in]     kernelSetup   The non-bonded kernel setup
144  * \param[in,out] nbat          The atomdata for the interactions
145  * \param[in]     ic            Non-bonded interaction constants
146  * \param[in]     shiftVectors  The PBC shift vectors
147  * \param[in]     stepWork      Flags that tell what to compute
148  * \param[in]     clearF        Enum that tells whether to clear the force output buffer
149  * \param[out]    vCoulomb      Output buffer for Coulomb energies
150  * \param[out]    vVdw          Output buffer for Van der Waals energies
151  * \param[in]     wcycle        Pointer to cycle counting data structure.
152  */
153 static void nbnxn_kernel_cpu(const PairlistSet&         pairlistSet,
154                              const Nbnxm::KernelSetup&  kernelSetup,
155                              nbnxn_atomdata_t*          nbat,
156                              const interaction_const_t& ic,
157                              rvec*                      shiftVectors,
158                              const gmx::StepWorkload&   stepWork,
159                              int                        clearF,
160                              real*                      vCoulomb,
161                              real*                      vVdw,
162                              gmx_wallcycle*             wcycle)
163 {
164
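    /* Pick the Coulomb kernel flavour (see the branches below): reaction-field
     * and plain cut-off electrostatics use coulktRF; Ewald with tabulated
     * exclusion corrections uses coulktTAB and analytical Ewald uses
     * coulktEWALD; the _TWIN variants apply when rcoulomb != rvdw.
     */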
165     int coulkt;
166     if (EEL_RF(ic.eeltype) || ic.eeltype == eelCUT)
167     {
168         coulkt = coulktRF;
169     }
170     else
171     {
172         if (kernelSetup.ewaldExclusionType == Nbnxm::EwaldExclusionType::Table)
173         {
174             if (ic.rcoulomb == ic.rvdw)
175             {
176                 coulkt = coulktTAB;
177             }
178             else
179             {
180                 coulkt = coulktTAB_TWIN;
181             }
182         }
183         else
184         {
185             if (ic.rcoulomb == ic.rvdw)
186             {
187                 coulkt = coulktEWALD;
188             }
189             else
190             {
191                 coulkt = coulktEWALD_TWIN;
192             }
193         }
194     }
195
196     const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
197
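    /* Pick the Van der Waals kernel flavour (see the branches below): plain
     * cut-off LJ (with or without potential shift) dispatches on the
     * combination rule, the force-switch and potential-switch modifiers get
     * dedicated kernels, and LJ-PME selects geometric or Lorentz-Berthelot
     * combination handling.
     */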
198     int vdwkt = 0;
199     if (ic.vdwtype == evdwCUT)
200     {
201         switch (ic.vdw_modifier)
202         {
203             case eintmodNONE:
204             case eintmodPOTSHIFT:
205                 switch (nbatParams.comb_rule)
206                 {
207                     case ljcrGEOM: vdwkt = vdwktLJCUT_COMBGEOM; break;
208                     case ljcrLB: vdwkt = vdwktLJCUT_COMBLB; break;
209                     case ljcrNONE: vdwkt = vdwktLJCUT_COMBNONE; break;
210                     default: GMX_RELEASE_ASSERT(false, "Unknown combination rule");
211                 }
212                 break;
213             case eintmodFORCESWITCH: vdwkt = vdwktLJFORCESWITCH; break;
214             case eintmodPOTSWITCH: vdwkt = vdwktLJPOTSWITCH; break;
215             default: GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction modifier");
216         }
217     }
218     else if (ic.vdwtype == evdwPME)
219     {
220         if (ic.ljpme_comb_rule == eljpmeGEOM)
221         {
222             vdwkt = vdwktLJEWALDCOMBGEOM;
223         }
224         else
225         {
226             vdwkt = vdwktLJEWALDCOMBLB;
227             /* At setup we (should have) selected the C reference kernel */
228             GMX_RELEASE_ASSERT(kernelSetup.kernelType == Nbnxm::KernelType::Cpu4x4_PlainC,
229                                "Only the plain-C reference nbnxn kernel supports LJ-PME with LB "
230                                "combination rules");
231         }
232     }
233     else
234     {
235         GMX_RELEASE_ASSERT(false, "Unsupported VdW interaction type");
236     }
237
238     gmx::ArrayRef<const NbnxnPairlistCpu> pairlists = pairlistSet.cpuLists();
239
240     int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
241     wallcycle_sub_start(wcycle, ewcsNONBONDED_CLEAR);
242 #pragma omp parallel for schedule(static) num_threads(nthreads)
243     for (gmx::index nb = 0; nb < pairlists.ssize(); nb++)
244     {
245         // Presently, the kernels do not call C++ code that can throw,
246         // so no need for a try/catch pair in this OpenMP region.
247         nbnxn_atomdata_output_t* out = &nbat->out[nb];
248
249         if (clearF == enbvClearFYes)
250         {
251             clearForceBuffer(nbat, nb);
252
253             clear_fshift(out->fshift.data());
254         }
255
256         if (nb == 0)
257         {
258             wallcycle_sub_stop(wcycle, ewcsNONBONDED_CLEAR);
259             wallcycle_sub_start(wcycle, ewcsNONBONDED_KERNEL);
260         }
261
262         // TODO: Change to reference
263         const NbnxnPairlistCpu* pairlist = &pairlists[nb];
264
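        /* Three kernel variants are dispatched below: force-only kernels when
         * no energies are needed, plain energy kernels when there is a single
         * energy group (pair), and energy-group kernels otherwise; the latter
         * write per-group-pair SIMD buffers that are reduced afterwards by
         * reduceGroupEnergySimdBuffers().
         */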
265         if (!stepWork.computeEnergy)
266         {
267             /* Don't calculate energies */
268             switch (kernelSetup.kernelType)
269             {
270                 case Nbnxm::KernelType::Cpu4x4_PlainC:
271                     nbnxn_kernel_noener_ref[coulkt][vdwkt](pairlist, nbat, &ic, shiftVectors, out);
272                     break;
273 #ifdef GMX_NBNXN_SIMD_2XNN
274                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
275                     nbnxm_kernel_noener_simd_2xmm[coulkt][vdwkt](pairlist, nbat, &ic, shiftVectors, out);
276                     break;
277 #endif
278 #ifdef GMX_NBNXN_SIMD_4XN
279                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
280                     nbnxm_kernel_noener_simd_4xm[coulkt][vdwkt](pairlist, nbat, &ic, shiftVectors, out);
281                     break;
282 #endif
283                 default: GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
284             }
285         }
286         else if (out->Vvdw.size() == 1)
287         {
288             /* A single energy group (pair) */
289             out->Vvdw[0] = 0;
290             out->Vc[0]   = 0;
291
292             switch (kernelSetup.kernelType)
293             {
294                 case Nbnxm::KernelType::Cpu4x4_PlainC:
295                     nbnxn_kernel_ener_ref[coulkt][vdwkt](pairlist, nbat, &ic, shiftVectors, out);
296                     break;
297 #ifdef GMX_NBNXN_SIMD_2XNN
298                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
299                     nbnxm_kernel_ener_simd_2xmm[coulkt][vdwkt](pairlist, nbat, &ic, shiftVectors, out);
300                     break;
301 #endif
302 #ifdef GMX_NBNXN_SIMD_4XN
303                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
304                     nbnxm_kernel_ener_simd_4xm[coulkt][vdwkt](pairlist, nbat, &ic, shiftVectors, out);
305                     break;
306 #endif
307                 default: GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
308             }
309         }
310         else
311         {
312             /* Calculate energy group contributions */
313             clearGroupEnergies(out);
314
315             int unrollj = 0;
316
317             switch (kernelSetup.kernelType)
318             {
319                 case Nbnxm::KernelType::Cpu4x4_PlainC:
320                     unrollj = c_nbnxnCpuIClusterSize;
321                     nbnxn_kernel_energrp_ref[coulkt][vdwkt](pairlist, nbat, &ic, shiftVectors, out);
322                     break;
323 #ifdef GMX_NBNXN_SIMD_2XNN
324                 case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
325                     unrollj = GMX_SIMD_REAL_WIDTH / 2;
326                     nbnxm_kernel_energrp_simd_2xmm[coulkt][vdwkt](pairlist, nbat, &ic, shiftVectors, out);
327                     break;
328 #endif
329 #ifdef GMX_NBNXN_SIMD_4XN
330                 case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
331                     unrollj = GMX_SIMD_REAL_WIDTH;
332                     nbnxm_kernel_energrp_simd_4xm[coulkt][vdwkt](pairlist, nbat, &ic, shiftVectors, out);
333                     break;
334 #endif
335                 default: GMX_RELEASE_ASSERT(false, "Unsupported kernel architecture");
336             }
337
338             if (kernelSetup.kernelType != Nbnxm::KernelType::Cpu4x4_PlainC)
339             {
340                 switch (unrollj)
341                 {
342                     case 2:
343                         reduceGroupEnergySimdBuffers<2>(nbatParams.nenergrp, nbatParams.neg_2log, out);
344                         break;
345                     case 4:
346                         reduceGroupEnergySimdBuffers<4>(nbatParams.nenergrp, nbatParams.neg_2log, out);
347                         break;
348                     case 8:
349                         reduceGroupEnergySimdBuffers<8>(nbatParams.nenergrp, nbatParams.neg_2log, out);
350                         break;
351                     default: GMX_RELEASE_ASSERT(false, "Unsupported j-unroll size");
352                 }
353             }
354         }
355     }
356     wallcycle_sub_stop(wcycle, ewcsNONBONDED_KERNEL);
357
358     if (stepWork.computeEnergy)
359     {
360         reduce_energies_over_lists(nbat, pairlists.ssize(), vVdw, vCoulomb);
361     }
362 }
363
364 static void accountFlops(t_nrnb*                    nrnb,
365                          const PairlistSet&         pairlistSet,
366                          const nonbonded_verlet_t&  nbv,
367                          const interaction_const_t& ic,
368                          const gmx::StepWorkload&   stepWork)
369 {
370     const bool usingGpuKernels = nbv.useGpu();
371
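    /* The flop counter is chosen to match the electrostatics flavour used by
     * the kernels: reaction-field/cut-off, analytical Ewald, or tabulated
     * Ewald (the fallback below).
     */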
372     int enr_nbnxn_kernel_ljc;
373     if (EEL_RF(ic.eeltype) || ic.eeltype == eelCUT)
374     {
375         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_RF;
376     }
377     else if ((!usingGpuKernels && nbv.kernelSetup().ewaldExclusionType == Nbnxm::EwaldExclusionType::Analytical)
378              || (usingGpuKernels && Nbnxm::gpu_is_kernel_ewald_analytical(nbv.gpu_nbv)))
379     {
380         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_EWALD;
381     }
382     else
383     {
384         enr_nbnxn_kernel_ljc = eNR_NBNXN_LJ_TAB;
385     }
386     int enr_nbnxn_kernel_lj = eNR_NBNXN_LJ;
387     if (stepWork.computeEnergy)
388     {
389         /* In eNR_??? the nbnxn F+E kernels are always the F kernel + 1 */
390         enr_nbnxn_kernel_ljc += 1;
391         enr_nbnxn_kernel_lj += 1;
392     }
393
394     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc, pairlistSet.natpair_ljq_);
395     inc_nrnb(nrnb, enr_nbnxn_kernel_lj, pairlistSet.natpair_lj_);
396     /* The Coulomb-only kernels are offset -eNR_NBNXN_LJ_RF+eNR_NBNXN_RF */
397     inc_nrnb(nrnb, enr_nbnxn_kernel_ljc - eNR_NBNXN_LJ_RF + eNR_NBNXN_RF, pairlistSet.natpair_q_);
398
399     if (ic.vdw_modifier == eintmodFORCESWITCH)
400     {
401         /* We add up the switch cost separately */
402         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_FSW + (stepWork.computeEnergy ? 1 : 0),
403                  pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
404     }
405     if (ic.vdw_modifier == eintmodPOTSWITCH)
406     {
407         /* We add up the switch cost separately */
408         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_PSW + (stepWork.computeEnergy ? 1 : 0),
409                  pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
410     }
411     if (ic.vdwtype == evdwPME)
412     {
413         /* We add up the LJ Ewald cost separately */
414         inc_nrnb(nrnb, eNR_NBNXN_ADD_LJ_EWALD + (stepWork.computeEnergy ? 1 : 0),
415                  pairlistSet.natpair_ljq_ + pairlistSet.natpair_lj_);
416     }
417 }
418
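/* Dispatch entry point, called once per pair-list locality per step.  A
 * minimal usage sketch (not from this file), assuming nbv, fr, stepWork,
 * enerd and nrnb have been set up by the force-calculation code:
 *
 *   nbv->dispatchNonbondedKernel(gmx::InteractionLocality::Local, *fr->ic,
 *                                stepWork, enbvClearFYes, *fr, enerd, nrnb);
 */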
419 void nonbonded_verlet_t::dispatchNonbondedKernel(gmx::InteractionLocality   iLocality,
420                                                  const interaction_const_t& ic,
421                                                  const gmx::StepWorkload&   stepWork,
422                                                  int                        clearF,
423                                                  const t_forcerec&          fr,
424                                                  gmx_enerdata_t*            enerd,
425                                                  t_nrnb*                    nrnb)
426 {
427     const PairlistSet& pairlistSet = pairlistSets().pairlistSet(iLocality);
428
429     switch (kernelSetup().kernelType)
430     {
431         case Nbnxm::KernelType::Cpu4x4_PlainC:
432         case Nbnxm::KernelType::Cpu4xN_Simd_4xN:
433         case Nbnxm::KernelType::Cpu4xN_Simd_2xNN:
434             nbnxn_kernel_cpu(pairlistSet, kernelSetup(), nbat.get(), ic, fr.shift_vec, stepWork,
435                              clearF, enerd->grpp.ener[egCOULSR].data(),
436                              fr.bBHAM ? enerd->grpp.ener[egBHAMSR].data() : enerd->grpp.ener[egLJSR].data(),
437                              wcycle_);
438             break;
439
440         case Nbnxm::KernelType::Gpu8x8x8:
441             Nbnxm::gpu_launch_kernel(gpu_nbv, stepWork, iLocality);
442             break;
443
444         case Nbnxm::KernelType::Cpu8x8x8_PlainC:
445             nbnxn_kernel_gpu_ref(
446                     pairlistSet.gpuList(), nbat.get(), &ic, fr.shift_vec, stepWork, clearF,
447                     nbat->out[0].f, nbat->out[0].fshift.data(), enerd->grpp.ener[egCOULSR].data(),
448                     fr.bBHAM ? enerd->grpp.ener[egBHAMSR].data() : enerd->grpp.ener[egLJSR].data());
449             break;
450
451         default: GMX_RELEASE_ASSERT(false, "Invalid nonbonded kernel type passed!");
452     }
453
454     accountFlops(nrnb, pairlistSet, *this, ic, stepWork);
455 }
456
457 void nonbonded_verlet_t::dispatchFreeEnergyKernel(gmx::InteractionLocality   iLocality,
458                                                   const t_forcerec*          fr,
459                                                   rvec                       x[],
460                                                   gmx::ForceWithShiftForces* forceWithShiftForces,
461                                                   const t_mdatoms&           mdatoms,
462                                                   t_lambda*                  fepvals,
463                                                   real*                      lambda,
464                                                   gmx_enerdata_t*            enerd,
465                                                   const gmx::StepWorkload&   stepWork,
466                                                   t_nrnb*                    nrnb)
467 {
468     const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists();
469
470     /* When the first list is empty, all are empty and there is nothing to do */
471     if (!pairlistSets().params().haveFep || nbl_fep[0]->nrj == 0)
472     {
473         return;
474     }
475
476     int donb_flags = 0;
477     /* Add short-range interactions */
478     donb_flags |= GMX_NONBONDED_DO_SR;
479
480     if (stepWork.computeForces)
481     {
482         donb_flags |= GMX_NONBONDED_DO_FORCE;
483     }
484     if (stepWork.computeVirial)
485     {
486         donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE;
487     }
488     if (stepWork.computeEnergy)
489     {
490         donb_flags |= GMX_NONBONDED_DO_POTENTIAL;
491     }
492
493     nb_kernel_data_t kernel_data;
494     real             dvdl_nb[efptNR] = { 0 };
495     kernel_data.flags                = donb_flags;
496     kernel_data.lambda               = lambda;
497     kernel_data.dvdl                 = dvdl_nb;
498
499     kernel_data.energygrp_elec = enerd->grpp.ener[egCOULSR].data();
500     kernel_data.energygrp_vdw  = enerd->grpp.ener[egLJSR].data();
501
502     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(),
503                "Number of lists should be the same as the number of NB threads");
504
505     wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP);
506 #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
507     for (gmx::index th = 0; th < nbl_fep.ssize(); th++)
508     {
509         try
510         {
511             gmx_nb_free_energy_kernel(nbl_fep[th].get(), x, forceWithShiftForces, fr, &mdatoms,
512                                       &kernel_data, nrnb);
513         }
514         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
515     }
516
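    /* With soft-core (sc_alpha != 0) the nonbonded potential depends
     * non-linearly on lambda, so the accumulated dV/dlambda goes into the
     * non-linear term; otherwise the lambda dependence is linear and it is
     * added to dvdl_lin.
     */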
517     if (fepvals->sc_alpha != 0)
518     {
519         enerd->dvdl_nonlin[efptVDW] += dvdl_nb[efptVDW];
520         enerd->dvdl_nonlin[efptCOUL] += dvdl_nb[efptCOUL];
521     }
522     else
523     {
524         enerd->dvdl_lin[efptVDW] += dvdl_nb[efptVDW];
525         enerd->dvdl_lin[efptCOUL] += dvdl_nb[efptCOUL];
526     }
527
528     /* If we compute foreign lambdas and have soft-core interactions,
529      * we have to recalculate the (non-linear) energy contributions.
530      */
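    /* For each foreign lambda point the kernels are rerun with force and
     * shift-force output disabled (only energies and dV/dlambda are needed):
     * the lambda vector lam_i is filled (point 0 is the current state), the
     * foreign energy accumulators are reset, and the resulting potential is
     * summed into enerpart_lambda[i].
     */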
531     if (fepvals->n_lambda > 0 && stepWork.computeDhdl && fepvals->sc_alpha != 0)
532     {
533         real lam_i[efptNR];
534         kernel_data.flags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE))
535                             | GMX_NONBONDED_DO_FOREIGNLAMBDA;
536         kernel_data.lambda         = lam_i;
537         kernel_data.dvdl           = dvdl_nb;
538         kernel_data.energygrp_elec = enerd->foreign_grpp.ener[egCOULSR].data();
539         kernel_data.energygrp_vdw  = enerd->foreign_grpp.ener[egLJSR].data();
540
541         for (size_t i = 0; i < enerd->enerpart_lambda.size(); i++)
542         {
543             std::fill(std::begin(dvdl_nb), std::end(dvdl_nb), 0);
544             for (int j = 0; j < efptNR; j++)
545             {
546                 lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i - 1]);
547             }
548             reset_foreign_enerdata(enerd);
549 #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
550             for (gmx::index th = 0; th < nbl_fep.ssize(); th++)
551             {
552                 try
553                 {
554                     gmx_nb_free_energy_kernel(nbl_fep[th].get(), x, forceWithShiftForces, fr,
555                                               &mdatoms, &kernel_data, nrnb);
556                 }
557                 GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
558             }
559
560             sum_epot(&(enerd->foreign_grpp), enerd->foreign_term);
561             enerd->enerpart_lambda[i] += enerd->foreign_term[F_EPOT];
562             enerd->dhdlLambda[i] += dvdl_nb[efptVDW] + dvdl_nb[efptCOUL];
563         }
564     }
565     else
566     {
567         for (size_t i = 0; i < enerd->enerpart_lambda.size(); i++)
568         {
569             enerd->dhdlLambda[i] += dvdl_nb[efptVDW] + dvdl_nb[efptCOUL];
570         }
571     }
572     wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP);
573 }