33e485d191f139b7152b1d28cf80e66f5d1520b5
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_free_energy.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #include "gmxpre.h"
39
40 #include "nb_free_energy.h"
41
42 #include "config.h"
43
44 #include <cmath>
45 #include <set>
46
47 #include <algorithm>
48
49 #include "gromacs/gmxlib/nrnb.h"
50 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
51 #include "gromacs/math/arrayrefwithpadding.h"
52 #include "gromacs/math/functions.h"
53 #include "gromacs/math/vec.h"
54 #include "gromacs/mdtypes/forceoutput.h"
55 #include "gromacs/mdtypes/forcerec.h"
56 #include "gromacs/mdtypes/interaction_const.h"
57 #include "gromacs/mdtypes/md_enums.h"
58 #include "gromacs/mdtypes/mdatom.h"
59 #include "gromacs/mdtypes/nblist.h"
60 #include "gromacs/pbcutil/ishift.h"
61 #include "gromacs/simd/simd.h"
62 #include "gromacs/simd/simd_math.h"
63 #include "gromacs/utility/fatalerror.h"
64 #include "gromacs/utility/arrayref.h"
65
66 #include "nb_softcore.h"
67
68 //! Scalar (non-SIMD) data types.
69 struct ScalarDataTypes
70 {
71     using RealType = real; //!< The data type to use as real.
72     using IntType  = int;  //!< The data type to use as int.
73     using BoolType = bool; //!< The data type to use as bool for real value comparison.
74     static constexpr int simdRealWidth = 1; //!< The width of the RealType.
75     static constexpr int simdIntWidth  = 1; //!< The width of the IntType.
76 };
77
78 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS
79 //! SIMD data types.
80 struct SimdDataTypes
81 {
82     using RealType = gmx::SimdReal;  //!< The data type to use as real.
83     using IntType  = gmx::SimdInt32; //!< The data type to use as int.
84     using BoolType = gmx::SimdBool;  //!< The data type to use as bool for real value comparison.
85     static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH; //!< The width of the RealType.
86 #    if GMX_SIMD_HAVE_DOUBLE && GMX_DOUBLE
87     static constexpr int simdIntWidth = GMX_SIMD_DINT32_WIDTH; //!< The width of the IntType.
88 #    else
89     static constexpr int simdIntWidth = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType.
90 #    endif
91 };
92 #endif
93
94 /*! \brief Lower limit for square interaction distances in nonbonded kernels.
95  *
96  * This is a mimimum on r^2 to avoid overflows when computing r^6.
97  * This will only affect results for soft-cored interaction at distances smaller
98  * than 1e-6 and will limit extremely high foreign energies for overlapping atoms.
99  * Note that we could use a somewhat smaller minimum in double precision.
100  * But because invsqrt in double precision can use single precision, this number
101  * can not be much smaller, we use the same number for simplicity.
102  */
103 constexpr real c_minDistanceSquared = 1.0e-12_real;
104
105 /*! \brief Higher limit for r^-6 used for Lennard-Jones interactions
106  *
107  * This is needed to avoid overflow of LJ energy and force terms for excluded
108  * atoms and foreign energies of hard-core states of overlapping atoms.
109  * Note that in single precision this value leaves room for C12 coefficients up to 3.4e8.
110  */
111 constexpr real c_maxRInvSix = 1.0e15_real;
112
113 template<bool computeForces, class RealType>
114 static inline void
115 pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType gmx_unused* force)
116 {
117     const RealType brsq = rSq * beta * beta;
118     if constexpr (computeForces)
119     {
120         *force = -brsq * beta * gmx::pmeForceCorrection(brsq);
121     }
122     *pot = beta * gmx::pmePotentialCorrection(brsq);
123 }
124
125 template<bool computeForces, class RealType, class BoolType>
126 static inline void pmeLJCorrectionVF(const RealType rInv,
127                                      const RealType rSq,
128                                      const real     ewaldLJCoeffSq,
129                                      const real     ewaldLJCoeffSixDivSix,
130                                      RealType*      pot,
131                                      RealType gmx_unused* force,
132                                      const BoolType       mask,
133                                      const BoolType       bIiEqJnr)
134 {
135     // We mask rInv to get zero force and potential for masked out pair interactions
136     const RealType rInvSq  = rInv * rInv;
137     const RealType rInvSix = rInvSq * rInvSq * rInvSq;
138     // Mask rSq to avoid underflow in exp()
139     const RealType coeffSqRSq       = ewaldLJCoeffSq * gmx::selectByMask(rSq, mask);
140     const RealType expNegCoeffSqRSq = gmx::exp(-coeffSqRSq);
141     const RealType poly             = 1.0_real + coeffSqRSq + 0.5_real * coeffSqRSq * coeffSqRSq;
142     if constexpr (computeForces)
143     {
144         *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
145         *force = *force * rInvSq;
146     }
147     // The self interaction is the limit for r -> 0 which we need to compute separately
148     *pot = gmx::blend(
149             rInvSix * (1.0_real - expNegCoeffSqRSq * poly), 0.5_real * ewaldLJCoeffSixDivSix, bIiEqJnr);
150 }
151
152 //! Computes r^(1/6) and 1/r^(1/6)
153 template<class RealType>
154 static inline void sixthRoot(const RealType r, RealType* sixthRoot, RealType* invSixthRoot)
155 {
156     RealType cbrtRes = gmx::cbrt(r);
157     *invSixthRoot    = gmx::invsqrt(cbrtRes);
158     *sixthRoot       = gmx::inv(*invSixthRoot);
159 }
160
161 template<class RealType>
162 static inline RealType calculateRinv6(const RealType rInvV)
163 {
164     RealType rInv6 = rInvV * rInvV;
165     return (rInv6 * rInv6 * rInv6);
166 }
167
168 template<class RealType>
169 static inline RealType calculateVdw6(const RealType c6, const RealType rInv6)
170 {
171     return (c6 * rInv6);
172 }
173
174 template<class RealType>
175 static inline RealType calculateVdw12(const RealType c12, const RealType rInv6)
176 {
177     return (c12 * rInv6 * rInv6);
178 }
179
180 /* reaction-field electrostatics */
181 template<class RealType>
182 static inline RealType reactionFieldScalarForce(const RealType qq,
183                                                 const RealType rInv,
184                                                 const RealType r,
185                                                 const real     krf,
186                                                 const real     two)
187 {
188     return (qq * (rInv - two * krf * r * r));
189 }
190 template<class RealType>
191 static inline RealType reactionFieldPotential(const RealType qq,
192                                               const RealType rInv,
193                                               const RealType r,
194                                               const real     krf,
195                                               const real     potentialShift)
196 {
197     return (qq * (rInv + krf * r * r - potentialShift));
198 }
199
200 /* Ewald electrostatics */
201 template<class RealType>
202 static inline RealType ewaldScalarForce(const RealType coulomb, const RealType rInv)
203 {
204     return (coulomb * rInv);
205 }
206 template<class RealType>
207 static inline RealType ewaldPotential(const RealType coulomb, const RealType rInv, const real potentialShift)
208 {
209     return (coulomb * (rInv - potentialShift));
210 }
211
212 /* cutoff LJ */
213 template<class RealType>
214 static inline RealType lennardJonesScalarForce(const RealType v6, const RealType v12)
215 {
216     return (v12 - v6);
217 }
218 template<class RealType>
219 static inline RealType lennardJonesPotential(const RealType v6,
220                                              const RealType v12,
221                                              const RealType c6,
222                                              const RealType c12,
223                                              const real     repulsionShift,
224                                              const real     dispersionShift,
225                                              const real     oneSixth,
226                                              const real     oneTwelfth)
227 {
228     return ((v12 + c12 * repulsionShift) * oneTwelfth - (v6 + c6 * dispersionShift) * oneSixth);
229 }
230
231 /* Ewald LJ */
232 template<class RealType>
233 static inline RealType ewaldLennardJonesGridSubtract(const RealType c6grid,
234                                                      const real     potentialShift,
235                                                      const real     oneSixth)
236 {
237     return (c6grid * potentialShift * oneSixth);
238 }
239
240 /* LJ Potential switch */
241 template<class RealType, class BoolType>
242 static inline RealType potSwitchScalarForceMod(const RealType fScalarInp,
243                                                const RealType potential,
244                                                const RealType sw,
245                                                const RealType r,
246                                                const RealType dsw,
247                                                const BoolType mask)
248 {
249     /* The mask should select on rV < rVdw */
250     return (gmx::selectByMask(fScalarInp * sw - r * potential * dsw, mask));
251 }
252 template<class RealType, class BoolType>
253 static inline RealType potSwitchPotentialMod(const RealType potentialInp, const RealType sw, const BoolType mask)
254 {
255     /* The mask should select on rV < rVdw */
256     return (gmx::selectByMask(potentialInp * sw, mask));
257 }
258
259
260 //! Templated free-energy non-bonded kernel
261 template<typename DataTypes, KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
262 static void nb_free_energy_kernel(const t_nblist&                                  nlist,
263                                   const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
264                                   const int                                        ntype,
265                                   const real                                       rlist,
266                                   const interaction_const_t&                       ic,
267                                   gmx::ArrayRef<const gmx::RVec>                   shiftvec,
268                                   gmx::ArrayRef<const real>                        nbfp,
269                                   gmx::ArrayRef<const real> gmx_unused             nbfp_grid,
270                                   gmx::ArrayRef<const real>                        chargeA,
271                                   gmx::ArrayRef<const real>                        chargeB,
272                                   gmx::ArrayRef<const int>                         typeA,
273                                   gmx::ArrayRef<const int>                         typeB,
274                                   int                                              flags,
275                                   gmx::ArrayRef<const real>                        lambda,
276                                   t_nrnb* gmx_restrict                             nrnb,
277                                   gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
278                                   rvec gmx_unused*    threadForceShiftBuffer,
279                                   gmx::ArrayRef<real> threadVc,
280                                   gmx::ArrayRef<real> threadVv,
281                                   gmx::ArrayRef<real> threadDvdl)
282 {
283 #define STATE_A 0
284 #define STATE_B 1
285 #define NSTATES 2
286
287     using RealType = typename DataTypes::RealType;
288     using IntType  = typename DataTypes::IntType;
289     using BoolType = typename DataTypes::BoolType;
290
291     constexpr real oneTwelfth = 1.0_real / 12.0_real;
292     constexpr real oneSixth   = 1.0_real / 6.0_real;
293     constexpr real zero       = 0.0_real;
294     constexpr real half       = 0.5_real;
295     constexpr real one        = 1.0_real;
296     constexpr real two        = 2.0_real;
297     constexpr real six        = 6.0_real;
298
299     // Extract pair list data
300     const int                nri    = nlist.nri;
301     gmx::ArrayRef<const int> iinr   = nlist.iinr;
302     gmx::ArrayRef<const int> jindex = nlist.jindex;
303     gmx::ArrayRef<const int> jjnr   = nlist.jjnr;
304     gmx::ArrayRef<const int> shift  = nlist.shift;
305     gmx::ArrayRef<const int> gid    = nlist.gid;
306
307     const real  lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
308     const real  lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
309     const auto& scParams    = *ic.softCoreParameters;
310     const real gmx_unused alpha_coul    = scParams.alphaCoulomb;
311     const real gmx_unused alpha_vdw     = scParams.alphaVdw;
312     const real            lam_power     = scParams.lambdaPower;
313     const real gmx_unused sigma6_def    = scParams.sigma6WithInvalidSigma;
314     const real gmx_unused sigma6_min    = scParams.sigma6Minimum;
315     const bool gmx_unused doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
316     const bool            doPotential   = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0);
317
318     // Extract data from interaction_const_t
319     const real            facel           = ic.epsfac;
320     const real            rCoulomb        = ic.rcoulomb;
321     const real            krf             = ic.reactionFieldCoefficient;
322     const real            crf             = ic.reactionFieldShift;
323     const real gmx_unused shLjEwald       = ic.sh_lj_ewald;
324     const real            rVdw            = ic.rvdw;
325     const real            dispersionShift = ic.dispersion_shift.cpot;
326     const real            repulsionShift  = ic.repulsion_shift.cpot;
327     const real            ewaldBeta       = ic.ewaldcoeff_q;
328     real gmx_unused       ewaldLJCoeffSq;
329     real gmx_unused       ewaldLJCoeffSixDivSix;
330     if constexpr (vdwInteractionTypeIsEwald)
331     {
332         ewaldLJCoeffSq        = ic.ewaldcoeff_lj * ic.ewaldcoeff_lj;
333         ewaldLJCoeffSixDivSix = ewaldLJCoeffSq * ewaldLJCoeffSq * ewaldLJCoeffSq / six;
334     }
335
336     // Note that the nbnxm kernels do not support Coulomb potential switching at all
337     GMX_ASSERT(ic.coulomb_modifier != InteractionModifiers::PotSwitch,
338                "Potential switching is not supported for Coulomb with FEP");
339
340     const real      rVdwSwitch = ic.rvdw_switch;
341     real gmx_unused vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
342     if constexpr (vdwModifierIsPotSwitch)
343     {
344         const real d = rVdw - rVdwSwitch;
345         vdw_swV3     = -10.0_real / (d * d * d);
346         vdw_swV4     = 15.0_real / (d * d * d * d);
347         vdw_swV5     = -6.0_real / (d * d * d * d * d);
348         vdw_swF2     = -30.0_real / (d * d * d);
349         vdw_swF3     = 60.0_real / (d * d * d * d);
350         vdw_swF4     = -30.0_real / (d * d * d * d * d);
351     }
352     else
353     {
354         /* Avoid warnings from stupid compilers (looking at you, Clang!) */
355         vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = zero;
356     }
357
358     NbkernelElecType icoul;
359     if (ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype))
360     {
361         icoul = NbkernelElecType::ReactionField;
362     }
363     else
364     {
365         icoul = NbkernelElecType::None;
366     }
367
368     real rcutoff_max2                 = std::max(ic.rcoulomb, ic.rvdw);
369     rcutoff_max2                      = rcutoff_max2 * rcutoff_max2;
370     const real gmx_unused rCutoffCoul = ic.rcoulomb;
371
372     real gmx_unused sh_ewald = 0;
373     if constexpr (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
374     {
375         sh_ewald = ic.sh_ewald;
376     }
377
378     /* For Ewald/PME interactions we cannot easily apply the soft-core component to
379      * reciprocal space. When we use non-switched Ewald interactions, we
380      * assume the soft-coring does not significantly affect the grid contribution
381      * and apply the soft-core only to the full 1/r (- shift) pair contribution.
382      *
383      * However, we cannot use this approach for switch-modified since we would then
384      * effectively end up evaluating a significantly different interaction here compared to the
385      * normal (non-free-energy) kernels, either by applying a cutoff at a different
386      * position than what the user requested, or by switching different
387      * things (1/r rather than short-range Ewald). For these settings, we just
388      * use the traditional short-range Ewald interaction in that case.
389      */
390     GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
391                        "Can not apply soft-core to switched Ewald potentials");
392
393     const RealType            minDistanceSquared(c_minDistanceSquared);
394     const RealType            maxRInvSix(c_maxRInvSix);
395     const RealType gmx_unused floatMin(GMX_FLOAT_MIN);
396
397     RealType dvdlCoul(zero);
398     RealType dvdlVdw(zero);
399
400     /* Lambda factor for state A, 1-lambda*/
401     real LFC[NSTATES], LFV[NSTATES];
402     LFC[STATE_A] = one - lambda_coul;
403     LFV[STATE_A] = one - lambda_vdw;
404
405     /* Lambda factor for state B, lambda*/
406     LFC[STATE_B] = lambda_coul;
407     LFV[STATE_B] = lambda_vdw;
408
409     /*derivative of the lambda factor for state A and B */
410     real DLF[NSTATES];
411     DLF[STATE_A] = -one;
412     DLF[STATE_B] = one;
413
414     real gmx_unused lFacCoul[NSTATES], dlFacCoul[NSTATES], lFacVdw[NSTATES], dlFacVdw[NSTATES];
415     constexpr real  sc_r_power = six;
416     for (int i = 0; i < NSTATES; i++)
417     {
418         lFacCoul[i]  = (lam_power == 2 ? (1 - LFC[i]) * (1 - LFC[i]) : (1 - LFC[i]));
419         dlFacCoul[i] = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFC[i]) : 1);
420         lFacVdw[i]   = (lam_power == 2 ? (1 - LFV[i]) * (1 - LFV[i]) : (1 - LFV[i]));
421         dlFacVdw[i]  = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
422     }
423
424     // We need pointers to real for SIMD access
425     const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
426     real* gmx_restrict       forceRealPtr;
427     if constexpr (computeForces)
428     {
429         GMX_ASSERT(nri == 0 || !threadForceBuffer.empty(), "need a valid threadForceBuffer");
430
431         forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0];
432
433         if (doShiftForces)
434         {
435             GMX_ASSERT(threadForceShiftBuffer != nullptr, "need a valid threadForceShiftBuffer");
436         }
437     }
438
439     const real rlistSquared = gmx::square(rlist);
440
441     bool haveExcludedPairsBeyondRlist = false;
442
443     for (int n = 0; n < nri; n++)
444     {
445         bool havePairsWithinCutoff = false;
446
447         const int  is   = shift[n];
448         const real shX  = shiftvec[is][XX];
449         const real shY  = shiftvec[is][YY];
450         const real shZ  = shiftvec[is][ZZ];
451         const int  nj0  = jindex[n];
452         const int  nj1  = jindex[n + 1];
453         const int  ii   = iinr[n];
454         const int  ii3  = 3 * ii;
455         const real ix   = shX + x[ii3 + 0];
456         const real iy   = shY + x[ii3 + 1];
457         const real iz   = shZ + x[ii3 + 2];
458         const real iqA  = facel * chargeA[ii];
459         const real iqB  = facel * chargeB[ii];
460         const int  ntiA = ntype * typeA[ii];
461         const int  ntiB = ntype * typeB[ii];
462         RealType   vCTot(0);
463         RealType   vVTot(0);
464         RealType   fIX(0);
465         RealType   fIY(0);
466         RealType   fIZ(0);
467
468 #if GMX_SIMD_HAVE_REAL
469         alignas(GMX_SIMD_ALIGNMENT) int            preloadIi[DataTypes::simdRealWidth];
470         alignas(GMX_SIMD_ALIGNMENT) int gmx_unused preloadIs[DataTypes::simdRealWidth];
471 #else
472         int            preloadIi[DataTypes::simdRealWidth];
473         int gmx_unused preloadIs[DataTypes::simdRealWidth];
474 #endif
475         for (int s = 0; s < DataTypes::simdRealWidth; s++)
476         {
477             preloadIi[s] = ii;
478             preloadIs[s] = shift[n];
479         }
480         IntType ii_s = gmx::load<IntType>(preloadIi);
481
482         for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
483         {
484             RealType r, rInv;
485
486 #if GMX_SIMD_HAVE_REAL
487             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIsValid[DataTypes::simdRealWidth];
488             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIncluded[DataTypes::simdRealWidth];
489             alignas(GMX_SIMD_ALIGNMENT) int32_t preloadJnr[DataTypes::simdRealWidth];
490             alignas(GMX_SIMD_ALIGNMENT) int32_t typeIndices[NSTATES][DataTypes::simdRealWidth];
491             alignas(GMX_SIMD_ALIGNMENT) real    preloadQq[NSTATES][DataTypes::simdRealWidth];
492             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
493             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
494             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
495             alignas(GMX_SIMD_ALIGNMENT) real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
496 #else
497             real            preloadPairIsValid[DataTypes::simdRealWidth];
498             real            preloadPairIncluded[DataTypes::simdRealWidth];
499             int             preloadJnr[DataTypes::simdRealWidth];
500             int             typeIndices[NSTATES][DataTypes::simdRealWidth];
501             real            preloadQq[NSTATES][DataTypes::simdRealWidth];
502             real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
503             real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
504             real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
505             real            preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
506 #endif
507             for (int s = 0; s < DataTypes::simdRealWidth; s++)
508             {
509                 if (k + s < nj1)
510                 {
511                     preloadPairIsValid[s] = true;
512                     /* Check if this pair on the exclusions list.*/
513                     preloadPairIncluded[s]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + s]);
514                     const int jnr           = jjnr[k + s];
515                     preloadJnr[s]           = jnr;
516                     typeIndices[STATE_A][s] = ntiA + typeA[jnr];
517                     typeIndices[STATE_B][s] = ntiB + typeB[jnr];
518                     preloadQq[STATE_A][s]   = iqA * chargeA[jnr];
519                     preloadQq[STATE_B][s]   = iqB * chargeB[jnr];
520
521                     for (int i = 0; i < NSTATES; i++)
522                     {
523                         if constexpr (vdwInteractionTypeIsEwald)
524                         {
525                             preloadLjPmeC6Grid[i][s] = nbfp_grid[2 * typeIndices[i][s]];
526                         }
527                         else
528                         {
529                             preloadLjPmeC6Grid[i][s] = 0;
530                         }
531                         if constexpr (softcoreType == KernelSoftcoreType::Beutler
532                                       || softcoreType == KernelSoftcoreType::Gapsys)
533                         {
534                             const real c6  = nbfp[2 * typeIndices[i][s]];
535                             const real c12 = nbfp[2 * typeIndices[i][s] + 1];
536                             if (c6 > 0 && c12 > 0)
537                             {
538                                 /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
539                                 preloadSigma6[i][s] = 0.5_real * c12 / c6;
540                                 if (preloadSigma6[i][s]
541                                     < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */
542                                 {
543                                     preloadSigma6[i][s] = sigma6_min;
544                                 }
545                             }
546                             else
547                             {
548                                 preloadSigma6[i][s] = sigma6_def;
549                             }
550                         }
551                     }
552                     if constexpr (softcoreType == KernelSoftcoreType::Beutler
553                                   || softcoreType == KernelSoftcoreType::Gapsys)
554                     {
555                         /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
556                         const real c12A = nbfp[2 * typeIndices[STATE_A][s] + 1];
557                         const real c12B = nbfp[2 * typeIndices[STATE_B][s] + 1];
558                         if (c12A > 0 && c12B > 0)
559                         {
560                             preloadAlphaVdwEff[s]  = 0;
561                             preloadAlphaCoulEff[s] = 0;
562                         }
563                         else
564                         {
565                             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
566                             {
567                                 preloadAlphaVdwEff[s]  = alpha_vdw;
568                                 preloadAlphaCoulEff[s] = alpha_coul;
569                             }
570                             else if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
571                             {
572                                 preloadAlphaVdwEff[s]  = alpha_vdw;
573                                 preloadAlphaCoulEff[s] = gmx::sixthroot(sigma6_def);
574                             }
575                         }
576                     }
577                 }
578                 else
579                 {
580                     preloadJnr[s]          = jjnr[k];
581                     preloadPairIsValid[s]  = false;
582                     preloadPairIncluded[s] = false;
583                     preloadAlphaVdwEff[s]  = 0;
584                     preloadAlphaCoulEff[s] = 0;
585
586                     for (int i = 0; i < NSTATES; i++)
587                     {
588                         typeIndices[STATE_A][s]  = ntiA + typeA[jjnr[k]];
589                         typeIndices[STATE_B][s]  = ntiB + typeB[jjnr[k]];
590                         preloadLjPmeC6Grid[i][s] = 0;
591                         preloadQq[i][s]          = 0;
592                         preloadSigma6[i][s]      = 0;
593                     }
594                 }
595             }
596
597             RealType jx, jy, jz;
598             gmx::gatherLoadUTranspose<3>(reinterpret_cast<const real*>(x), preloadJnr, &jx, &jy, &jz);
599
600             const RealType pairIsValid   = gmx::load<RealType>(preloadPairIsValid);
601             const RealType pairIncluded  = gmx::load<RealType>(preloadPairIncluded);
602             const BoolType bPairIncluded = (pairIncluded != zero);
603             const BoolType bPairExcluded = (pairIncluded == zero && pairIsValid != zero);
604
605             const RealType dX  = ix - jx;
606             const RealType dY  = iy - jy;
607             const RealType dZ  = iz - jz;
608             RealType       rSq = dX * dX + dY * dY + dZ * dZ;
609
610             BoolType withinCutoffMask = (rSq < rcutoff_max2);
611
612             if (!gmx::anyTrue(withinCutoffMask || bPairExcluded))
613             {
614                 /* We save significant time by skipping all code below.
615                  * Note that with soft-core interactions, the actual cut-off
616                  * check might be different. But since the soft-core distance
617                  * is always larger than r, checking on r here is safe.
618                  * Exclusions outside the cutoff can not be skipped as
619                  * when using Ewald: the reciprocal-space
620                  * Ewald component still needs to be subtracted.
621                  */
622                 continue;
623             }
624             else
625             {
626                 havePairsWithinCutoff = true;
627             }
628
629             if (gmx::anyTrue(rlistSquared < rSq && bPairExcluded))
630             {
631                 haveExcludedPairsBeyondRlist = true;
632             }
633
634             const IntType  jnr_s    = gmx::load<IntType>(preloadJnr);
635             const BoolType bIiEqJnr = gmx::cvtIB2B(ii_s == jnr_s);
636
637             RealType            c6[NSTATES];
638             RealType            c12[NSTATES];
639             RealType gmx_unused sigma6[NSTATES];
640             RealType            qq[NSTATES];
641             RealType gmx_unused ljPmeC6Grid[NSTATES];
642             RealType gmx_unused alphaVdwEff;
643             RealType gmx_unused alphaCoulEff;
644             for (int i = 0; i < NSTATES; i++)
645             {
646                 gmx::gatherLoadTranspose<2>(nbfp.data(), typeIndices[i], &c6[i], &c12[i]);
647                 qq[i]          = gmx::load<RealType>(preloadQq[i]);
648                 ljPmeC6Grid[i] = gmx::load<RealType>(preloadLjPmeC6Grid[i]);
649                 if constexpr (softcoreType == KernelSoftcoreType::Beutler
650                               || softcoreType == KernelSoftcoreType::Gapsys)
651                 {
652                     sigma6[i] = gmx::load<RealType>(preloadSigma6[i]);
653                 }
654             }
655             if constexpr (softcoreType == KernelSoftcoreType::Beutler
656                           || softcoreType == KernelSoftcoreType::Gapsys)
657             {
658                 alphaVdwEff  = gmx::load<RealType>(preloadAlphaVdwEff);
659                 alphaCoulEff = gmx::load<RealType>(preloadAlphaCoulEff);
660             }
661
662             // Avoid overflow of r^-12 at distances near zero
663             rSq  = gmx::max(rSq, minDistanceSquared);
664             rInv = gmx::invsqrt(rSq);
665             r    = rSq * rInv;
666
667             RealType gmx_unused rp, rpm2;
668             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
669             {
670                 rpm2 = rSq * rSq;  /* r4 */
671                 rp   = rpm2 * rSq; /* r6 */
672             }
673             else
674             {
675                 /* The soft-core power p will not affect the results
676                  * with not using soft-core, so we use power of 0 which gives
677                  * the simplest math and cheapest code.
678                  */
679                 rpm2 = rInv * rInv;
680                 rp   = one;
681             }
682
683             RealType fScal(0);
684
685             /* The following block is masked to only calculate values having bPairIncluded. If
686              * bPairIncluded is true then withinCutoffMask must also be true. */
687             if (gmx::anyTrue(withinCutoffMask && bPairIncluded))
688             {
689                 RealType gmx_unused fScalC[NSTATES], fScalV[NSTATES];
690                 RealType            vCoul[NSTATES], vVdw[NSTATES];
691                 for (int i = 0; i < NSTATES; i++)
692                 {
693                     fScalC[i] = zero;
694                     fScalV[i] = zero;
695                     vCoul[i]  = zero;
696                     vVdw[i]   = zero;
697
698                     RealType gmx_unused rInvC, rInvV, rC, rV, rPInvC, rPInvV;
699
700                     /* The following block is masked to require (qq[i] != 0 || c6[i] != 0 || c12[i]
701                      * != 0) in addition to bPairIncluded, which in turn requires withinCutoffMask. */
702                     BoolType nonZeroState = ((qq[i] != zero || c6[i] != zero || c12[i] != zero)
703                                              && bPairIncluded && withinCutoffMask);
704                     if (gmx::anyTrue(nonZeroState))
705                     {
706                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
707                         {
708                             RealType divisor = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
709                             rPInvC           = gmx::inv(divisor);
710                             sixthRoot(rPInvC, &rInvC, &rC);
711
712                             if constexpr (scLambdasOrAlphasDiffer)
713                             {
714                                 RealType divisor = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
715                                 rPInvV           = gmx::inv(divisor);
716                                 sixthRoot(rPInvV, &rInvV, &rV);
717                             }
718                             else
719                             {
720                                 /* We can avoid one expensive pow and one / operation */
721                                 rPInvV = rPInvC;
722                                 rInvV  = rInvC;
723                                 rV     = rC;
724                             }
725                         }
726                         else
727                         {
728                             rPInvC = one;
729                             rInvC  = rInv;
730                             rC     = r;
731
732                             rPInvV = one;
733                             rInvV  = rInv;
734                             rV     = r;
735                         }
736
737                         /* Only process the coulomb interactions if we either
738                          * include all entries in the list (no cutoff
739                          * used in the kernel), or if we are within the cutoff.
740                          */
741                         BoolType computeElecInteraction;
742                         if constexpr (elecInteractionTypeIsEwald)
743                         {
744                             computeElecInteraction = (r < rCoulomb && qq[i] != zero && bPairIncluded);
745                         }
746                         else
747                         {
748                             computeElecInteraction = (rC < rCoulomb && qq[i] != zero && bPairIncluded);
749                         }
750                         if (gmx::anyTrue(computeElecInteraction))
751                         {
752                             if constexpr (elecInteractionTypeIsEwald)
753                             {
754                                 vCoul[i] = ewaldPotential(qq[i], rInvC, sh_ewald);
755                                 if constexpr (computeForces)
756                                 {
757                                     fScalC[i] = ewaldScalarForce(qq[i], rInvC);
758                                 }
759
760                                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
761                                 {
762                                     ewaldQuadraticPotential(qq[i],
763                                                             facel,
764                                                             rC,
765                                                             rCutoffCoul,
766                                                             LFC[i],
767                                                             DLF[i],
768                                                             alphaCoulEff,
769                                                             sh_ewald,
770                                                             &fScalC[i],
771                                                             &vCoul[i],
772                                                             &dvdlCoul,
773                                                             computeElecInteraction);
774                                 }
775                             }
776                             else
777                             {
778                                 vCoul[i] = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
779                                 if constexpr (computeForces)
780                                 {
781                                     fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
782                                 }
783
784                                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
785                                 {
786                                     reactionFieldQuadraticPotential(qq[i],
787                                                                     facel,
788                                                                     rC,
789                                                                     rCutoffCoul,
790                                                                     LFC[i],
791                                                                     DLF[i],
792                                                                     alphaCoulEff,
793                                                                     krf,
794                                                                     crf,
795                                                                     &fScalC[i],
796                                                                     &vCoul[i],
797                                                                     &dvdlCoul,
798                                                                     computeElecInteraction);
799                                 }
800                             }
801
802                             vCoul[i] = gmx::selectByMask(vCoul[i], computeElecInteraction);
803                             if constexpr (computeForces)
804                             {
805                                 fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
806                             }
807                         }
808
809                         /* Only process the VDW interactions if we either
810                          * include all entries in the list (no cutoff used
811                          * in the kernel), or if we are within the cutoff.
812                          */
813                         BoolType computeVdwInteraction;
814                         if constexpr (vdwInteractionTypeIsEwald)
815                         {
816                             computeVdwInteraction =
817                                     (r < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
818                         }
819                         else
820                         {
821                             computeVdwInteraction =
822                                     (rV < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
823                         }
824                         if (gmx::anyTrue(computeVdwInteraction))
825                         {
826                             RealType rInv6;
827                             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
828                             {
829                                 rInv6 = rPInvV;
830                             }
831                             else
832                             {
833                                 rInv6 = calculateRinv6(rInvV);
834                             }
835                             // Avoid overflow at short distance for masked exclusions and
836                             // for foreign energy calculations at a hard core end state.
837                             // Note that we should limit r^-6, and thus also r^-12, and
838                             // not only r^-12, as that could lead to erroneously low instead
839                             // of very high foreign energies.
840                             rInv6           = gmx::min(rInv6, maxRInvSix);
841                             RealType vVdw6  = calculateVdw6(c6[i], rInv6);
842                             RealType vVdw12 = calculateVdw12(c12[i], rInv6);
843
844                             vVdw[i] = lennardJonesPotential(
845                                     vVdw6, vVdw12, c6[i], c12[i], repulsionShift, dispersionShift, oneSixth, oneTwelfth);
846                             if constexpr (computeForces)
847                             {
848                                 fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
849                             }
850
851                             if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
852                             {
853                                 lennardJonesQuadraticPotential(c6[i],
854                                                                c12[i],
855                                                                r,
856                                                                rSq,
857                                                                LFV[i],
858                                                                DLF[i],
859                                                                sigma6[i],
860                                                                alphaVdwEff,
861                                                                repulsionShift,
862                                                                dispersionShift,
863                                                                &fScalV[i],
864                                                                &vVdw[i],
865                                                                &dvdlVdw,
866                                                                computeVdwInteraction);
867                             }
868
869                             if constexpr (vdwInteractionTypeIsEwald)
870                             {
871                                 /* Subtract the grid potential at the cut-off */
872                                 vVdw[i] = vVdw[i]
873                                           + gmx::selectByMask(ewaldLennardJonesGridSubtract(
874                                                                       ljPmeC6Grid[i], shLjEwald, oneSixth),
875                                                               computeVdwInteraction);
876                             }
877
878                             if constexpr (vdwModifierIsPotSwitch)
879                             {
880                                 RealType d             = rV - rVdwSwitch;
881                                 BoolType zeroMask      = zero < d;
882                                 BoolType potSwitchMask = rV < rVdw;
883                                 d                      = gmx::selectByMask(d, zeroMask);
884                                 const RealType d2      = d * d;
885                                 const RealType sw =
886                                         one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
887
888                                 if constexpr (computeForces)
889                                 {
890                                     const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
891                                     fScalV[i]          = potSwitchScalarForceMod(
892                                             fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
893                                 }
894                                 vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, potSwitchMask);
895                             }
896
897                             vVdw[i] = gmx::selectByMask(vVdw[i], computeVdwInteraction);
898                             if constexpr (computeForces)
899                             {
900                                 fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
901                             }
902                         }
903
904                         if constexpr (computeForces)
905                         {
906                             /* fScalC (and fScalV) now contain: dV/drC * rC
907                              * Now we multiply by rC^-6, so it will be: dV/drC * rC^-5
908                              * Further down we first multiply by r^4 and then by
909                              * the vector r, which in total gives: dV/drC * (r/rC)^-5
910                              */
911                             fScalC[i] = fScalC[i] * rPInvC;
912                             fScalV[i] = fScalV[i] * rPInvV;
913                         }
914                     } // end of block requiring nonZeroState
915                 }     // end for (int i = 0; i < NSTATES; i++)
916
917                 /* Assemble A and B states. */
918                 BoolType assembleStates = (bPairIncluded && withinCutoffMask);
919                 if (gmx::anyTrue(assembleStates))
920                 {
921                     for (int i = 0; i < NSTATES; i++)
922                     {
923                         vCTot = vCTot + LFC[i] * vCoul[i];
924                         vVTot = vVTot + LFV[i] * vVdw[i];
925
926                         if constexpr (computeForces)
927                         {
928                             fScal = fScal + LFC[i] * fScalC[i] * rpm2;
929                             fScal = fScal + LFV[i] * fScalV[i] * rpm2;
930                         }
931
932                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
933                         {
934                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i]
935                                        + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i];
936                             dvdlVdw = dvdlVdw + vVdw[i] * DLF[i]
937                                       + LFV[i] * alphaVdwEff * dlFacVdw[i] * fScalV[i] * sigma6[i];
938                         }
939                         else
940                         {
941                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i];
942                             dvdlVdw  = dvdlVdw + vVdw[i] * DLF[i];
943                         }
944                     }
945                 }
946             } // end of block requiring bPairIncluded && withinCutoffMask
947             /* In the following block bPairIncluded should be false in the masks. */
948             if (icoul == NbkernelElecType::ReactionField)
949             {
950                 const BoolType computeReactionField = bPairExcluded;
951
952                 if (gmx::anyTrue(computeReactionField))
953                 {
954                     /* For excluded pairs we don't use soft-core.
955                      * As there is no singularity, there is no need for soft-core.
956                      */
957                     const RealType FF = -two * krf;
958                     RealType       VV = krf * rSq - crf;
959
960                     /* If ii == jnr the i particle (ii) has itself (jnr)
961                      * in its neighborlist. This corresponds to a self-interaction
962                      * that will occur twice. Scale it down by 50% to only include
963                      * it once.
964                      */
965                     VV = VV * gmx::blend(one, half, bIiEqJnr);
966
967                     for (int i = 0; i < NSTATES; i++)
968                     {
969                         vCTot = vCTot + gmx::selectByMask(LFC[i] * qq[i] * VV, computeReactionField);
970                         fScal = fScal + gmx::selectByMask(LFC[i] * qq[i] * FF, computeReactionField);
971                         dvdlCoul = dvdlCoul + gmx::selectByMask(DLF[i] * qq[i] * VV, computeReactionField);
972                     }
973                 }
974             }
975
976             const BoolType computeElecEwaldInteraction = (bPairExcluded || r < rCoulomb);
977             if (elecInteractionTypeIsEwald && gmx::anyTrue(computeElecEwaldInteraction))
978             {
979                 /* See comment in the preamble. When using Ewald interactions
980                  * (unless we use a switch modifier) we subtract the reciprocal-space
981                  * Ewald component here which made it possible to apply the free
982                  * energy interaction to 1/r (vanilla coulomb short-range part)
983                  * above. This gets us closer to the ideal case of applying
984                  * the softcore to the entire electrostatic interaction,
985                  * including the reciprocal-space component.
986                  */
987                 RealType v_lr, f_lr;
988
989                 pmeCoulombCorrectionVF<computeForces>(rSq, ewaldBeta, &v_lr, &f_lr);
990                 if constexpr (computeForces)
991                 {
992                     f_lr = f_lr * rInv * rInv;
993                 }
994
995                 /* Note that any possible Ewald shift has already been applied in
996                  * the normal interaction part above.
997                  */
998
999                 /* If ii == jnr the i particle (ii) has itself (jnr)
1000                  * in its neighborlist. This corresponds to a self-interaction
1001                  * that will occur twice. Scale it down by 50% to only include
1002                  * it once.
1003                  */
1004                 v_lr = v_lr * gmx::blend(one, half, bIiEqJnr);
1005
1006                 for (int i = 0; i < NSTATES; i++)
1007                 {
1008                     vCTot = vCTot - gmx::selectByMask(LFC[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1009                     if constexpr (computeForces)
1010                     {
1011                         fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
1012                     }
1013                     dvdlCoul = dvdlCoul
1014                                - gmx::selectByMask(DLF[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1015                 }
1016             }
1017
1018             const BoolType computeVdwEwaldInteraction = (bPairExcluded || r < rVdw);
1019             if (vdwInteractionTypeIsEwald && gmx::anyTrue(computeVdwEwaldInteraction))
1020             {
1021                 /* See comment in the preamble. When using LJ-Ewald interactions
1022                  * (unless we use a switch modifier) we subtract the reciprocal-space
1023                  * Ewald component here which made it possible to apply the free
1024                  * energy interaction to r^-6 (vanilla LJ6 short-range part)
1025                  * above. This gets us closer to the ideal case of applying
1026                  * the softcore to the entire VdW interaction,
1027                  * including the reciprocal-space component.
1028                  */
1029
1030                 RealType v_lr, f_lr;
1031                 pmeLJCorrectionVF<computeForces>(
1032                         rInv, rSq, ewaldLJCoeffSq, ewaldLJCoeffSixDivSix, &v_lr, &f_lr, computeVdwEwaldInteraction, bIiEqJnr);
1033                 v_lr = v_lr * oneSixth;
1034
1035                 for (int i = 0; i < NSTATES; i++)
1036                 {
1037                     vVTot = vVTot + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1038                     if constexpr (computeForces)
1039                     {
1040                         fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
1041                     }
1042                     dvdlVdw = dvdlVdw + gmx::selectByMask(DLF[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1043                 }
1044             }
1045
1046             if (computeForces && gmx::anyTrue(fScal != zero))
1047             {
1048                 const RealType tX = fScal * dX;
1049                 const RealType tY = fScal * dY;
1050                 const RealType tZ = fScal * dZ;
1051                 fIX               = fIX + tX;
1052                 fIY               = fIY + tY;
1053                 fIZ               = fIZ + tZ;
1054
1055                 gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ);
1056             }
1057         } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
1058
1059         if (havePairsWithinCutoff)
1060         {
1061             if constexpr (computeForces)
1062             {
1063                 gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ);
1064
1065                 if (doShiftForces)
1066                 {
1067                     gmx::transposeScatterIncrU<3>(
1068                             reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
1069                 }
1070             }
1071             if (doPotential)
1072             {
1073                 int ggid = gid[n];
1074                 threadVc[ggid] += gmx::reduce(vCTot);
1075                 threadVv[ggid] += gmx::reduce(vVTot);
1076             }
1077         }
1078     } // end for (int n = 0; n < nri; n++)
1079
1080     if (gmx::anyTrue(dvdlCoul != zero))
1081     {
1082         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)] += gmx::reduce(dvdlCoul);
1083     }
1084     if (gmx::anyTrue(dvdlVdw != zero))
1085     {
1086         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)] += gmx::reduce(dvdlVdw);
1087     }
1088
1089     /* Estimate flops, average for free energy stuff:
1090      * 12  flops per outer iteration
1091      * 150 flops per inner iteration
1092      * TODO: Update the number of flops and/or use different counts for different code paths.
1093      */
1094     atomicNrnbIncrement(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist.nri * 12 + nlist.jindex[nri] * 150);
1095
1096     if (haveExcludedPairsBeyondRlist > 0)
1097     {
1098         gmx_fatal(FARGS,
1099                   "There are perturbed non-bonded pair interactions beyond the pair-list cutoff "
1100                   "of %g nm, which is not supported. This can happen because the system is "
1101                   "unstable or because intra-molecular interactions at long distances are "
1102                   "excluded. If the "
1103                   "latter is the case, you can try to increase nstlist or rlist to avoid this."
1104                   "The error is likely triggered by the use of couple-intramol=no "
1105                   "and the maximal distance in the decoupled molecule exceeding rlist.",
1106                   rlist);
1107     }
1108 }
1109
1110 typedef void (*KernelFunction)(const t_nblist&                                  nlist,
1111                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1112                                const int                                        ntype,
1113                                const real                                       rlist,
1114                                const interaction_const_t&                       ic,
1115                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1116                                gmx::ArrayRef<const real>                        nbfp,
1117                                gmx::ArrayRef<const real>                        nbfp_grid,
1118                                gmx::ArrayRef<const real>                        chargeA,
1119                                gmx::ArrayRef<const real>                        chargeB,
1120                                gmx::ArrayRef<const int>                         typeA,
1121                                gmx::ArrayRef<const int>                         typeB,
1122                                int                                              flags,
1123                                gmx::ArrayRef<const real>                        lambda,
1124                                t_nrnb* gmx_restrict                             nrnb,
1125                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1126                                rvec*               threadForceShiftBuffer,
1127                                gmx::ArrayRef<real> threadVc,
1128                                gmx::ArrayRef<real> threadVv,
1129                                gmx::ArrayRef<real> threadDvdl);
1130
1131 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
1132 static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
1133 {
1134     if (useSimd)
1135     {
1136 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS
1137         return (nb_free_energy_kernel<SimdDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1138 #else
1139         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch computeForces>);
1140 #endif
1141     }
1142     else
1143     {
1144         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1145     }
1146 }
1147
1148 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
1149 static KernelFunction dispatchKernelOnComputeForces(const bool computeForces, const bool useSimd)
1150 {
1151     if (computeForces)
1152     {
1153         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, true>(
1154                 useSimd));
1155     }
1156     else
1157     {
1158         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, false>(
1159                 useSimd));
1160     }
1161 }
1162
1163 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
1164 static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch,
1165                                                   const bool computeForces,
1166                                                   const bool useSimd)
1167 {
1168     if (vdwModifierIsPotSwitch)
1169     {
1170         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
1171                 computeForces, useSimd));
1172     }
1173     else
1174     {
1175         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
1176                 computeForces, useSimd));
1177     }
1178 }
1179
1180 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
1181 static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
1182                                                           const bool vdwModifierIsPotSwitch,
1183                                                           const bool computeForces,
1184                                                           const bool useSimd)
1185 {
1186     if (elecInteractionTypeIsEwald)
1187     {
1188         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
1189                 vdwModifierIsPotSwitch, computeForces, useSimd));
1190     }
1191     else
1192     {
1193         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
1194                 vdwModifierIsPotSwitch, computeForces, useSimd));
1195     }
1196 }
1197
1198 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer>
1199 static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
1200                                                          const bool elecInteractionTypeIsEwald,
1201                                                          const bool vdwModifierIsPotSwitch,
1202                                                          const bool computeForces,
1203                                                          const bool useSimd)
1204 {
1205     if (vdwInteractionTypeIsEwald)
1206     {
1207         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, true>(
1208                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1209     }
1210     else
1211     {
1212         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, false>(
1213                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1214     }
1215 }
1216
1217 template<KernelSoftcoreType softcoreType>
1218 static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
1219                                                                   const bool vdwInteractionTypeIsEwald,
1220                                                                   const bool elecInteractionTypeIsEwald,
1221                                                                   const bool vdwModifierIsPotSwitch,
1222                                                                   const bool computeForces,
1223                                                                   const bool useSimd)
1224 {
1225     if (scLambdasOrAlphasDiffer)
1226     {
1227         return (dispatchKernelOnVdwInteractionType<softcoreType, true>(
1228                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1229     }
1230     else
1231     {
1232         return (dispatchKernelOnVdwInteractionType<softcoreType, false>(
1233                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1234     }
1235 }
1236
1237 static KernelFunction dispatchKernel(const bool                 scLambdasOrAlphasDiffer,
1238                                      const bool                 vdwInteractionTypeIsEwald,
1239                                      const bool                 elecInteractionTypeIsEwald,
1240                                      const bool                 vdwModifierIsPotSwitch,
1241                                      const bool                 computeForces,
1242                                      const bool                 useSimd,
1243                                      const interaction_const_t& ic)
1244 {
1245     if (ic.softCoreParameters->alphaCoulomb == 0 && ic.softCoreParameters->alphaVdw == 0)
1246     {
1247         return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
1248                 scLambdasOrAlphasDiffer,
1249                 vdwInteractionTypeIsEwald,
1250                 elecInteractionTypeIsEwald,
1251                 vdwModifierIsPotSwitch,
1252                 computeForces,
1253                 useSimd));
1254     }
1255     else
1256     {
1257         if (ic.softCoreParameters->softcoreType == SoftcoreType::Beutler)
1258         {
1259             return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Beutler>(
1260                     scLambdasOrAlphasDiffer,
1261                     vdwInteractionTypeIsEwald,
1262                     elecInteractionTypeIsEwald,
1263                     vdwModifierIsPotSwitch,
1264                     computeForces,
1265                     useSimd));
1266         }
1267         else
1268         {
1269             return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Gapsys>(
1270                     scLambdasOrAlphasDiffer,
1271                     vdwInteractionTypeIsEwald,
1272                     elecInteractionTypeIsEwald,
1273                     vdwModifierIsPotSwitch,
1274                     computeForces,
1275                     useSimd));
1276         }
1277     }
1278 }
1279
1280
1281 void gmx_nb_free_energy_kernel(const t_nblist&                                  nlist,
1282                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1283                                const bool                                       useSimd,
1284                                const int                                        ntype,
1285                                const real                                       rlist,
1286                                const interaction_const_t&                       ic,
1287                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1288                                gmx::ArrayRef<const real>                        nbfp,
1289                                gmx::ArrayRef<const real>                        nbfp_grid,
1290                                gmx::ArrayRef<const real>                        chargeA,
1291                                gmx::ArrayRef<const real>                        chargeB,
1292                                gmx::ArrayRef<const int>                         typeA,
1293                                gmx::ArrayRef<const int>                         typeB,
1294                                int                                              flags,
1295                                gmx::ArrayRef<const real>                        lambda,
1296                                t_nrnb*                                          nrnb,
1297                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1298                                rvec*               threadForceShiftBuffer,
1299                                gmx::ArrayRef<real> threadVc,
1300                                gmx::ArrayRef<real> threadVv,
1301                                gmx::ArrayRef<real> threadDvdl)
1302 {
1303     GMX_ASSERT(EEL_PME_EWALD(ic.eeltype) || ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype),
1304                "Unsupported eeltype with free energy");
1305     GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters");
1306
1307     // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert
1308     GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty()
1309                        || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(),
1310                "We need actual padding with at least one element for SIMD scatter operations");
1311
1312     const auto& scParams                   = *ic.softCoreParameters;
1313     const bool  vdwInteractionTypeIsEwald  = (EVDW_PME(ic.vdwtype));
1314     const bool  elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype));
1315     const bool  vdwModifierIsPotSwitch     = (ic.vdw_modifier == InteractionModifiers::PotSwitch);
1316     const bool  computeForces              = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
1317     bool        scLambdasOrAlphasDiffer    = true;
1318
1319     if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
1320     {
1321         scLambdasOrAlphasDiffer = false;
1322     }
1323     else
1324     {
1325         if (lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
1326                     == lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]
1327             && scParams.alphaCoulomb == scParams.alphaVdw)
1328         {
1329             scLambdasOrAlphasDiffer = false;
1330         }
1331     }
1332
1333     KernelFunction kernelFunc;
1334     kernelFunc = dispatchKernel(scLambdasOrAlphasDiffer,
1335                                 vdwInteractionTypeIsEwald,
1336                                 elecInteractionTypeIsEwald,
1337                                 vdwModifierIsPotSwitch,
1338                                 computeForces,
1339                                 useSimd,
1340                                 ic);
1341     kernelFunc(nlist,
1342                coords,
1343                ntype,
1344                rlist,
1345                ic,
1346                shiftvec,
1347                nbfp,
1348                nbfp_grid,
1349                chargeA,
1350                chargeB,
1351                typeA,
1352                typeB,
1353                flags,
1354                lambda,
1355                nrnb,
1356                threadForceBuffer,
1357                threadForceShiftBuffer,
1358                threadVc,
1359                threadVv,
1360                threadDvdl);
1361 }