Add gapsys softcore function.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_free_energy.cpp
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
5  * Copyright (c) 2001-2004, The GROMACS development team.
6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
9  * and including many others, as listed in the AUTHORS file in the
10  * top-level source directory and at http://www.gromacs.org.
11  *
12  * GROMACS is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public License
14  * as published by the Free Software Foundation; either version 2.1
15  * of the License, or (at your option) any later version.
16  *
17  * GROMACS is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with GROMACS; if not, see
24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
26  *
27  * If you want to redistribute modifications to GROMACS, please
28  * consider that scientific software is very special. Version
29  * control is crucial - bugs must be traceable. We will be happy to
30  * consider code for inclusion in the official distribution, but
31  * derived work must not be called official GROMACS. Details are found
32  * in the README & COPYING files - if they are missing, get the
33  * official version at http://www.gromacs.org.
34  *
35  * To help us fund GROMACS development, we humbly ask that you cite
36  * the research papers on the package. Check out http://www.gromacs.org.
37  */
38 #include "gmxpre.h"
39
40 #include "nb_free_energy.h"
41
42 #include "config.h"
43
44 #include <cmath>
45 #include <set>
46
47 #include <algorithm>
48
49 #include "gromacs/gmxlib/nrnb.h"
50 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
51 #include "gromacs/math/arrayrefwithpadding.h"
52 #include "gromacs/math/functions.h"
53 #include "gromacs/math/vec.h"
54 #include "gromacs/mdtypes/forceoutput.h"
55 #include "gromacs/mdtypes/forcerec.h"
56 #include "gromacs/mdtypes/interaction_const.h"
57 #include "gromacs/mdtypes/md_enums.h"
58 #include "gromacs/mdtypes/mdatom.h"
59 #include "gromacs/mdtypes/nblist.h"
60 #include "gromacs/pbcutil/ishift.h"
61 #include "gromacs/simd/simd.h"
62 #include "gromacs/simd/simd_math.h"
63 #include "gromacs/utility/fatalerror.h"
64 #include "gromacs/utility/arrayref.h"
65
66 #include "nb_softcore.h"
67
68 //! Scalar (non-SIMD) data types.
69 struct ScalarDataTypes
70 {
71     using RealType = real; //!< The data type to use as real.
72     using IntType  = int;  //!< The data type to use as int.
73     using BoolType = bool; //!< The data type to use as bool for real value comparison.
74     static constexpr int simdRealWidth = 1; //!< The width of the RealType.
75     static constexpr int simdIntWidth  = 1; //!< The width of the IntType.
76 };
77
78 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS
79 //! SIMD data types.
80 struct SimdDataTypes
81 {
82     using RealType = gmx::SimdReal;  //!< The data type to use as real.
83     using IntType  = gmx::SimdInt32; //!< The data type to use as int.
84     using BoolType = gmx::SimdBool;  //!< The data type to use as bool for real value comparison.
85     static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH; //!< The width of the RealType.
86 #    if GMX_SIMD_HAVE_DOUBLE && GMX_DOUBLE
87     static constexpr int simdIntWidth = GMX_SIMD_DINT32_WIDTH; //!< The width of the IntType.
88 #    else
89     static constexpr int simdIntWidth = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType.
90 #    endif
91 };
92 #endif
93
94 /*! \brief Lower limit for square interaction distances in nonbonded kernels.
95  *
96  * This is a mimimum on r^2 to avoid overflows when computing r^6.
97  * This will only affect results for soft-cored interaction at distances smaller
98  * than 1e-6 and will limit extremely high foreign energies for overlapping atoms.
99  * Note that we could use a somewhat smaller minimum in double precision.
100  * But because invsqrt in double precision can use single precision, this number
101  * can not be much smaller, we use the same number for simplicity.
102  */
103 constexpr real c_minDistanceSquared = 1.0e-12_real;
104
105 /*! \brief Higher limit for r^-6 used for Lennard-Jones interactions
106  *
107  * This is needed to avoid overflow of LJ energy and force terms for excluded
108  * atoms and foreign energies of hard-core states of overlapping atoms.
109  * Note that in single precision this value leaves room for C12 coefficients up to 3.4e8.
110  */
111 constexpr real c_maxRInvSix = 1.0e15_real;
112
113 template<bool computeForces, class RealType>
114 static inline void
115 pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType gmx_unused* force)
116 {
117     const RealType brsq = rSq * beta * beta;
118     if constexpr (computeForces)
119     {
120         *force = -brsq * beta * gmx::pmeForceCorrection(brsq);
121     }
122     *pot = beta * gmx::pmePotentialCorrection(brsq);
123 }
124
125 template<bool computeForces, class RealType, class BoolType>
126 static inline void pmeLJCorrectionVF(const RealType rInv,
127                                      const RealType rSq,
128                                      const real     ewaldLJCoeffSq,
129                                      const real     ewaldLJCoeffSixDivSix,
130                                      RealType*      pot,
131                                      RealType gmx_unused* force,
132                                      const BoolType       mask,
133                                      const BoolType       bIiEqJnr)
134 {
135     // We mask rInv to get zero force and potential for masked out pair interactions
136     const RealType rInvSq  = rInv * rInv;
137     const RealType rInvSix = rInvSq * rInvSq * rInvSq;
138     // Mask rSq to avoid underflow in exp()
139     const RealType coeffSqRSq       = ewaldLJCoeffSq * gmx::selectByMask(rSq, mask);
140     const RealType expNegCoeffSqRSq = gmx::exp(-coeffSqRSq);
141     const RealType poly             = 1.0_real + coeffSqRSq + 0.5_real * coeffSqRSq * coeffSqRSq;
142     if constexpr (computeForces)
143     {
144         *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
145         *force = *force * rInvSq;
146     }
147     // The self interaction is the limit for r -> 0 which we need to compute separately
148     *pot = gmx::blend(
149             rInvSix * (1.0_real - expNegCoeffSqRSq * poly), 0.5_real * ewaldLJCoeffSixDivSix, bIiEqJnr);
150 }
151
152 //! Computes r^(1/6) and 1/r^(1/6)
153 template<class RealType>
154 static inline void sixthRoot(const RealType r, RealType* sixthRoot, RealType* invSixthRoot)
155 {
156     RealType cbrtRes = gmx::cbrt(r);
157     *invSixthRoot    = gmx::invsqrt(cbrtRes);
158     *sixthRoot       = gmx::inv(*invSixthRoot);
159 }
160
161 template<class RealType>
162 static inline RealType calculateRinv6(const RealType rInvV)
163 {
164     RealType rInv6 = rInvV * rInvV;
165     return (rInv6 * rInv6 * rInv6);
166 }
167
168 template<class RealType>
169 static inline RealType calculateVdw6(const RealType c6, const RealType rInv6)
170 {
171     return (c6 * rInv6);
172 }
173
174 template<class RealType>
175 static inline RealType calculateVdw12(const RealType c12, const RealType rInv6)
176 {
177     return (c12 * rInv6 * rInv6);
178 }
179
180 /* reaction-field electrostatics */
181 template<class RealType>
182 static inline RealType reactionFieldScalarForce(const RealType qq,
183                                                 const RealType rInv,
184                                                 const RealType r,
185                                                 const real     krf,
186                                                 const real     two)
187 {
188     return (qq * (rInv - two * krf * r * r));
189 }
190 template<class RealType>
191 static inline RealType reactionFieldPotential(const RealType qq,
192                                               const RealType rInv,
193                                               const RealType r,
194                                               const real     krf,
195                                               const real     potentialShift)
196 {
197     return (qq * (rInv + krf * r * r - potentialShift));
198 }
199
200 /* Ewald electrostatics */
201 template<class RealType>
202 static inline RealType ewaldScalarForce(const RealType coulomb, const RealType rInv)
203 {
204     return (coulomb * rInv);
205 }
206 template<class RealType>
207 static inline RealType ewaldPotential(const RealType coulomb, const RealType rInv, const real potentialShift)
208 {
209     return (coulomb * (rInv - potentialShift));
210 }
211
212 /* cutoff LJ */
213 template<class RealType>
214 static inline RealType lennardJonesScalarForce(const RealType v6, const RealType v12)
215 {
216     return (v12 - v6);
217 }
218 template<class RealType>
219 static inline RealType lennardJonesPotential(const RealType v6,
220                                              const RealType v12,
221                                              const RealType c6,
222                                              const RealType c12,
223                                              const real     repulsionShift,
224                                              const real     dispersionShift,
225                                              const real     oneSixth,
226                                              const real     oneTwelfth)
227 {
228     return ((v12 + c12 * repulsionShift) * oneTwelfth - (v6 + c6 * dispersionShift) * oneSixth);
229 }
230
231 /* Ewald LJ */
232 template<class RealType>
233 static inline RealType ewaldLennardJonesGridSubtract(const RealType c6grid,
234                                                      const real     potentialShift,
235                                                      const real     oneSixth)
236 {
237     return (c6grid * potentialShift * oneSixth);
238 }
239
240 /* LJ Potential switch */
241 template<class RealType, class BoolType>
242 static inline RealType potSwitchScalarForceMod(const RealType fScalarInp,
243                                                const RealType potential,
244                                                const RealType sw,
245                                                const RealType r,
246                                                const RealType dsw,
247                                                const BoolType mask)
248 {
249     /* The mask should select on rV < rVdw */
250     return (gmx::selectByMask(fScalarInp * sw - r * potential * dsw, mask));
251 }
252 template<class RealType, class BoolType>
253 static inline RealType potSwitchPotentialMod(const RealType potentialInp, const RealType sw, const BoolType mask)
254 {
255     /* The mask should select on rV < rVdw */
256     return (gmx::selectByMask(potentialInp * sw, mask));
257 }
258
259
260 //! Templated free-energy non-bonded kernel
261 template<typename DataTypes, SoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForce>
262 static void nb_free_energy_kernel(const t_nblist&                                  nlist,
263                                   const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
264                                   const int                                        ntype,
265                                   const real                                       rlist,
266                                   const interaction_const_t&                       ic,
267                                   gmx::ArrayRef<const gmx::RVec>                   shiftvec,
268                                   gmx::ArrayRef<const real>                        nbfp,
269                                   gmx::ArrayRef<const real> gmx_unused             nbfp_grid,
270                                   gmx::ArrayRef<const real>                        chargeA,
271                                   gmx::ArrayRef<const real>                        chargeB,
272                                   gmx::ArrayRef<const int>                         typeA,
273                                   gmx::ArrayRef<const int>                         typeB,
274                                   int                                              flags,
275                                   gmx::ArrayRef<const real>                        lambda,
276                                   t_nrnb* gmx_restrict                             nrnb,
277                                   gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
278                                   rvec gmx_unused*    threadForceShiftBuffer,
279                                   gmx::ArrayRef<real> threadVc,
280                                   gmx::ArrayRef<real> threadVv,
281                                   gmx::ArrayRef<real> threadDvdl)
282 {
283 #define STATE_A 0
284 #define STATE_B 1
285 #define NSTATES 2
286
287     using RealType = typename DataTypes::RealType;
288     using IntType  = typename DataTypes::IntType;
289     using BoolType = typename DataTypes::BoolType;
290
291     constexpr real oneTwelfth = 1.0_real / 12.0_real;
292     constexpr real oneSixth   = 1.0_real / 6.0_real;
293     constexpr real zero       = 0.0_real;
294     constexpr real half       = 0.5_real;
295     constexpr real one        = 1.0_real;
296     constexpr real two        = 2.0_real;
297     constexpr real six        = 6.0_real;
298
299     // Extract pair list data
300     const int                nri    = nlist.nri;
301     gmx::ArrayRef<const int> iinr   = nlist.iinr;
302     gmx::ArrayRef<const int> jindex = nlist.jindex;
303     gmx::ArrayRef<const int> jjnr   = nlist.jjnr;
304     gmx::ArrayRef<const int> shift  = nlist.shift;
305     gmx::ArrayRef<const int> gid    = nlist.gid;
306
307     const real  lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
308     const real  lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
309     const auto& scParams    = *ic.softCoreParameters;
310     const real gmx_unused alpha_coul    = scParams.alphaCoulomb;
311     const real gmx_unused alpha_vdw     = scParams.alphaVdw;
312     const real            lam_power     = scParams.lambdaPower;
313     const real gmx_unused sigma6_def    = scParams.sigma6WithInvalidSigma;
314     const real gmx_unused sigma6_min    = scParams.sigma6Minimum;
315     const bool gmx_unused doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
316     const bool            doPotential   = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0);
317
318     // Extract data from interaction_const_t
319     const real            facel           = ic.epsfac;
320     const real            rCoulomb        = ic.rcoulomb;
321     const real            krf             = ic.reactionFieldCoefficient;
322     const real            crf             = ic.reactionFieldShift;
323     const real gmx_unused shLjEwald       = ic.sh_lj_ewald;
324     const real            rVdw            = ic.rvdw;
325     const real            dispersionShift = ic.dispersion_shift.cpot;
326     const real            repulsionShift  = ic.repulsion_shift.cpot;
327     const real            ewaldBeta       = ic.ewaldcoeff_q;
328     real gmx_unused       ewaldLJCoeffSq;
329     real gmx_unused       ewaldLJCoeffSixDivSix;
330     if constexpr (vdwInteractionTypeIsEwald)
331     {
332         ewaldLJCoeffSq        = ic.ewaldcoeff_lj * ic.ewaldcoeff_lj;
333         ewaldLJCoeffSixDivSix = ewaldLJCoeffSq * ewaldLJCoeffSq * ewaldLJCoeffSq / six;
334     }
335
336     // Note that the nbnxm kernels do not support Coulomb potential switching at all
337     GMX_ASSERT(ic.coulomb_modifier != InteractionModifiers::PotSwitch,
338                "Potential switching is not supported for Coulomb with FEP");
339
340     const real      rVdwSwitch = ic.rvdw_switch;
341     real gmx_unused vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
342     if constexpr (vdwModifierIsPotSwitch)
343     {
344         const real d = rVdw - rVdwSwitch;
345         vdw_swV3     = -10.0_real / (d * d * d);
346         vdw_swV4     = 15.0_real / (d * d * d * d);
347         vdw_swV5     = -6.0_real / (d * d * d * d * d);
348         vdw_swF2     = -30.0_real / (d * d * d);
349         vdw_swF3     = 60.0_real / (d * d * d * d);
350         vdw_swF4     = -30.0_real / (d * d * d * d * d);
351     }
352     else
353     {
354         /* Avoid warnings from stupid compilers (looking at you, Clang!) */
355         vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = zero;
356     }
357
358     NbkernelElecType icoul;
359     if (ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype))
360     {
361         icoul = NbkernelElecType::ReactionField;
362     }
363     else
364     {
365         icoul = NbkernelElecType::None;
366     }
367
368     real rcutoff_max2                 = std::max(ic.rcoulomb, ic.rvdw);
369     rcutoff_max2                      = rcutoff_max2 * rcutoff_max2;
370     const real gmx_unused rCutoffCoul = ic.rcoulomb;
371
372     real gmx_unused sh_ewald = 0;
373     if constexpr (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
374     {
375         sh_ewald = ic.sh_ewald;
376     }
377
378     /* For Ewald/PME interactions we cannot easily apply the soft-core component to
379      * reciprocal space. When we use non-switched Ewald interactions, we
380      * assume the soft-coring does not significantly affect the grid contribution
381      * and apply the soft-core only to the full 1/r (- shift) pair contribution.
382      *
383      * However, we cannot use this approach for switch-modified since we would then
384      * effectively end up evaluating a significantly different interaction here compared to the
385      * normal (non-free-energy) kernels, either by applying a cutoff at a different
386      * position than what the user requested, or by switching different
387      * things (1/r rather than short-range Ewald). For these settings, we just
388      * use the traditional short-range Ewald interaction in that case.
389      */
390     GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
391                        "Can not apply soft-core to switched Ewald potentials");
392
393     const RealType            minDistanceSquared(c_minDistanceSquared);
394     const RealType            maxRInvSix(c_maxRInvSix);
395     const RealType gmx_unused floatMin(GMX_FLOAT_MIN);
396
397     RealType dvdlCoul(zero);
398     RealType dvdlVdw(zero);
399
400     /* Lambda factor for state A, 1-lambda*/
401     real LFC[NSTATES], LFV[NSTATES];
402     LFC[STATE_A] = one - lambda_coul;
403     LFV[STATE_A] = one - lambda_vdw;
404
405     /* Lambda factor for state B, lambda*/
406     LFC[STATE_B] = lambda_coul;
407     LFV[STATE_B] = lambda_vdw;
408
409     /*derivative of the lambda factor for state A and B */
410     real DLF[NSTATES];
411     DLF[STATE_A] = -one;
412     DLF[STATE_B] = one;
413
414     real gmx_unused lFacCoul[NSTATES], dlFacCoul[NSTATES], lFacVdw[NSTATES], dlFacVdw[NSTATES];
415     constexpr real  sc_r_power = six;
416     for (int i = 0; i < NSTATES; i++)
417     {
418         lFacCoul[i]  = (lam_power == 2 ? (1 - LFC[i]) * (1 - LFC[i]) : (1 - LFC[i]));
419         dlFacCoul[i] = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFC[i]) : 1);
420         lFacVdw[i]   = (lam_power == 2 ? (1 - LFV[i]) * (1 - LFV[i]) : (1 - LFV[i]));
421         dlFacVdw[i]  = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
422     }
423
424     // We need pointers to real for SIMD access
425     const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
426     real* gmx_restrict       forceRealPtr;
427     if constexpr (computeForces)
428     {
429         GMX_ASSERT(nri == 0 || !threadForceBuffer.empty(), "need a valid threadForceBuffer");
430
431         forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0];
432
433         if (doShiftForces)
434         {
435             GMX_ASSERT(threadForceShiftBuffer != nullptr, "need a valid threadForceShiftBuffer");
436         }
437     }
438
439     const real rlistSquared = gmx::square(rlist);
440
441     bool haveExcludedPairsBeyondRlist = false;
442
443     for (int n = 0; n < nri; n++)
444     {
445         bool havePairsWithinCutoff = false;
446
447         const int  is   = shift[n];
448         const real shX  = shiftvec[is][XX];
449         const real shY  = shiftvec[is][YY];
450         const real shZ  = shiftvec[is][ZZ];
451         const int  nj0  = jindex[n];
452         const int  nj1  = jindex[n + 1];
453         const int  ii   = iinr[n];
454         const int  ii3  = 3 * ii;
455         const real ix   = shX + x[ii3 + 0];
456         const real iy   = shY + x[ii3 + 1];
457         const real iz   = shZ + x[ii3 + 2];
458         const real iqA  = facel * chargeA[ii];
459         const real iqB  = facel * chargeB[ii];
460         const int  ntiA = ntype * typeA[ii];
461         const int  ntiB = ntype * typeB[ii];
462         RealType   vCTot(0);
463         RealType   vVTot(0);
464         RealType   fIX(0);
465         RealType   fIY(0);
466         RealType   fIZ(0);
467
468 #if GMX_SIMD_HAVE_REAL
469         alignas(GMX_SIMD_ALIGNMENT) int            preloadIi[DataTypes::simdRealWidth];
470         alignas(GMX_SIMD_ALIGNMENT) int gmx_unused preloadIs[DataTypes::simdRealWidth];
471 #else
472         int            preloadIi[DataTypes::simdRealWidth];
473         int gmx_unused preloadIs[DataTypes::simdRealWidth];
474 #endif
475         for (int s = 0; s < DataTypes::simdRealWidth; s++)
476         {
477             preloadIi[s] = ii;
478             preloadIs[s] = shift[n];
479         }
480         IntType ii_s = gmx::load<IntType>(preloadIi);
481
482         for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
483         {
484             RealType r, rInv;
485
486 #if GMX_SIMD_HAVE_REAL
487             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIsValid[DataTypes::simdRealWidth];
488             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIncluded[DataTypes::simdRealWidth];
489             alignas(GMX_SIMD_ALIGNMENT) int32_t preloadJnr[DataTypes::simdRealWidth];
490             alignas(GMX_SIMD_ALIGNMENT) int32_t typeIndices[NSTATES][DataTypes::simdRealWidth];
491             alignas(GMX_SIMD_ALIGNMENT) real    preloadQq[NSTATES][DataTypes::simdRealWidth];
492             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
493             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
494             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
495             alignas(GMX_SIMD_ALIGNMENT) real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
496 #else
497             real            preloadPairIsValid[DataTypes::simdRealWidth];
498             real            preloadPairIncluded[DataTypes::simdRealWidth];
499             int             preloadJnr[DataTypes::simdRealWidth];
500             int             typeIndices[NSTATES][DataTypes::simdRealWidth];
501             real            preloadQq[NSTATES][DataTypes::simdRealWidth];
502             real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
503             real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
504             real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
505             real            preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
506 #endif
507             for (int s = 0; s < DataTypes::simdRealWidth; s++)
508             {
509                 if (k + s < nj1)
510                 {
511                     preloadPairIsValid[s] = true;
512                     /* Check if this pair on the exclusions list.*/
513                     preloadPairIncluded[s]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + s]);
514                     const int jnr           = jjnr[k + s];
515                     preloadJnr[s]           = jnr;
516                     typeIndices[STATE_A][s] = ntiA + typeA[jnr];
517                     typeIndices[STATE_B][s] = ntiB + typeB[jnr];
518                     preloadQq[STATE_A][s]   = iqA * chargeA[jnr];
519                     preloadQq[STATE_B][s]   = iqB * chargeB[jnr];
520
521                     for (int i = 0; i < NSTATES; i++)
522                     {
523                         if constexpr (vdwInteractionTypeIsEwald)
524                         {
525                             preloadLjPmeC6Grid[i][s] = nbfp_grid[2 * typeIndices[i][s]];
526                         }
527                         else
528                         {
529                             preloadLjPmeC6Grid[i][s] = 0;
530                         }
531                         if constexpr (softcoreType == SoftcoreType::Beutler
532                                       || softcoreType == SoftcoreType::Gapsys)
533                         {
534                             const real c6  = nbfp[2 * typeIndices[i][s]];
535                             const real c12 = nbfp[2 * typeIndices[i][s] + 1];
536                             if (c6 > 0 && c12 > 0)
537                             {
538                                 /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
539                                 preloadSigma6[i][s] = 0.5_real * c12 / c6;
540                                 if (preloadSigma6[i][s]
541                                     < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */
542                                 {
543                                     preloadSigma6[i][s] = sigma6_min;
544                                 }
545                             }
546                             else
547                             {
548                                 preloadSigma6[i][s] = sigma6_def;
549                             }
550                         }
551                     }
552                     if constexpr (softcoreType == SoftcoreType::Beutler
553                                   || softcoreType == SoftcoreType::Gapsys)
554                     {
555                         /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
556                         const real c12A = nbfp[2 * typeIndices[STATE_A][s] + 1];
557                         const real c12B = nbfp[2 * typeIndices[STATE_B][s] + 1];
558                         if (c12A > 0 && c12B > 0)
559                         {
560                             preloadAlphaVdwEff[s]  = 0;
561                             preloadAlphaCoulEff[s] = 0;
562                         }
563                         else
564                         {
565                             if constexpr (softcoreType == SoftcoreType::Beutler)
566                             {
567                                 preloadAlphaVdwEff[s]  = alpha_vdw;
568                                 preloadAlphaCoulEff[s] = alpha_coul;
569                             }
570                             else if constexpr (softcoreType == SoftcoreType::Gapsys)
571                             {
572                                 preloadAlphaVdwEff[s]  = alpha_vdw;
573                                 preloadAlphaCoulEff[s] = gmx::sixthroot(sigma6_def);
574                             }
575                         }
576                     }
577                 }
578                 else
579                 {
580                     preloadJnr[s]          = jjnr[k];
581                     preloadPairIsValid[s]  = false;
582                     preloadPairIncluded[s] = false;
583                     preloadAlphaVdwEff[s]  = 0;
584                     preloadAlphaCoulEff[s] = 0;
585
586                     for (int i = 0; i < NSTATES; i++)
587                     {
588                         typeIndices[STATE_A][s]  = ntiA + typeA[jjnr[k]];
589                         typeIndices[STATE_B][s]  = ntiB + typeB[jjnr[k]];
590                         preloadLjPmeC6Grid[i][s] = 0;
591                         preloadQq[i][s]          = 0;
592                         preloadSigma6[i][s]      = 0;
593                     }
594                 }
595             }
596
597             RealType jx, jy, jz;
598             gmx::gatherLoadUTranspose<3>(reinterpret_cast<const real*>(x), preloadJnr, &jx, &jy, &jz);
599
600             const RealType pairIsValid   = gmx::load<RealType>(preloadPairIsValid);
601             const RealType pairIncluded  = gmx::load<RealType>(preloadPairIncluded);
602             const BoolType bPairIncluded = (pairIncluded != zero);
603             const BoolType bPairExcluded = (pairIncluded == zero && pairIsValid != zero);
604
605             const RealType dX  = ix - jx;
606             const RealType dY  = iy - jy;
607             const RealType dZ  = iz - jz;
608             RealType       rSq = dX * dX + dY * dY + dZ * dZ;
609
610             BoolType withinCutoffMask = (rSq < rcutoff_max2);
611
612             if (!gmx::anyTrue(withinCutoffMask || bPairExcluded))
613             {
614                 /* We save significant time by skipping all code below.
615                  * Note that with soft-core interactions, the actual cut-off
616                  * check might be different. But since the soft-core distance
617                  * is always larger than r, checking on r here is safe.
618                  * Exclusions outside the cutoff can not be skipped as
619                  * when using Ewald: the reciprocal-space
620                  * Ewald component still needs to be subtracted.
621                  */
622                 continue;
623             }
624             else
625             {
626                 havePairsWithinCutoff = true;
627             }
628
629             if (gmx::anyTrue(rlistSquared < rSq && bPairExcluded))
630             {
631                 haveExcludedPairsBeyondRlist = true;
632             }
633
634             const IntType  jnr_s    = gmx::load<IntType>(preloadJnr);
635             const BoolType bIiEqJnr = gmx::cvtIB2B(ii_s == jnr_s);
636
637             RealType            c6[NSTATES];
638             RealType            c12[NSTATES];
639             RealType gmx_unused sigma6[NSTATES];
640             RealType            qq[NSTATES];
641             RealType gmx_unused ljPmeC6Grid[NSTATES];
642             RealType gmx_unused alphaVdwEff;
643             RealType gmx_unused alphaCoulEff;
644             for (int i = 0; i < NSTATES; i++)
645             {
646                 gmx::gatherLoadTranspose<2>(nbfp.data(), typeIndices[i], &c6[i], &c12[i]);
647                 qq[i]          = gmx::load<RealType>(preloadQq[i]);
648                 ljPmeC6Grid[i] = gmx::load<RealType>(preloadLjPmeC6Grid[i]);
649                 if constexpr (softcoreType == SoftcoreType::Beutler || softcoreType == SoftcoreType::Gapsys)
650                 {
651                     sigma6[i] = gmx::load<RealType>(preloadSigma6[i]);
652                 }
653             }
654             if constexpr (softcoreType == SoftcoreType::Beutler || softcoreType == SoftcoreType::Gapsys)
655             {
656                 alphaVdwEff  = gmx::load<RealType>(preloadAlphaVdwEff);
657                 alphaCoulEff = gmx::load<RealType>(preloadAlphaCoulEff);
658             }
659
660             // Avoid overflow of r^-12 at distances near zero
661             rSq  = gmx::max(rSq, minDistanceSquared);
662             rInv = gmx::invsqrt(rSq);
663             r    = rSq * rInv;
664
665             RealType gmx_unused rp, rpm2;
666             if constexpr (softcoreType == SoftcoreType::Beutler)
667             {
668                 rpm2 = rSq * rSq;  /* r4 */
669                 rp   = rpm2 * rSq; /* r6 */
670             }
671             else
672             {
673                 /* The soft-core power p will not affect the results
674                  * with not using soft-core, so we use power of 0 which gives
675                  * the simplest math and cheapest code.
676                  */
677                 rpm2 = rInv * rInv;
678                 rp   = one;
679             }
680
681             RealType fScal(0);
682
683             /* The following block is masked to only calculate values having bPairIncluded. If
684              * bPairIncluded is true then withinCutoffMask must also be true. */
685             if (gmx::anyTrue(withinCutoffMask && bPairIncluded))
686             {
687                 RealType gmx_unused fScalC[NSTATES], fScalV[NSTATES];
688                 RealType            vCoul[NSTATES], vVdw[NSTATES];
689                 for (int i = 0; i < NSTATES; i++)
690                 {
691                     fScalC[i] = zero;
692                     fScalV[i] = zero;
693                     vCoul[i]  = zero;
694                     vVdw[i]   = zero;
695
696                     RealType gmx_unused rInvC, rInvV, rC, rV, rPInvC, rPInvV;
697
698                     /* The following block is masked to require (qq[i] != 0 || c6[i] != 0 || c12[i]
699                      * != 0) in addition to bPairIncluded, which in turn requires withinCutoffMask. */
700                     BoolType nonZeroState = ((qq[i] != zero || c6[i] != zero || c12[i] != zero)
701                                              && bPairIncluded && withinCutoffMask);
702                     if (gmx::anyTrue(nonZeroState))
703                     {
704                         if constexpr (softcoreType == SoftcoreType::Beutler)
705                         {
706                             RealType divisor = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
707                             rPInvC           = gmx::inv(divisor);
708                             sixthRoot(rPInvC, &rInvC, &rC);
709
710                             if constexpr (scLambdasOrAlphasDiffer)
711                             {
712                                 RealType divisor = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
713                                 rPInvV           = gmx::inv(divisor);
714                                 sixthRoot(rPInvV, &rInvV, &rV);
715                             }
716                             else
717                             {
718                                 /* We can avoid one expensive pow and one / operation */
719                                 rPInvV = rPInvC;
720                                 rInvV  = rInvC;
721                                 rV     = rC;
722                             }
723                         }
724                         else
725                         {
726                             rPInvC = one;
727                             rInvC  = rInv;
728                             rC     = r;
729
730                             rPInvV = one;
731                             rInvV  = rInv;
732                             rV     = r;
733                         }
734
735                         /* Only process the coulomb interactions if we either
736                          * include all entries in the list (no cutoff
737                          * used in the kernel), or if we are within the cutoff.
738                          */
739                         BoolType computeElecInteraction;
740                         if constexpr (elecInteractionTypeIsEwald)
741                         {
742                             computeElecInteraction = (r < rCoulomb && qq[i] != zero && bPairIncluded);
743                         }
744                         else
745                         {
746                             computeElecInteraction = (rC < rCoulomb && qq[i] != zero && bPairIncluded);
747                         }
748                         if (gmx::anyTrue(computeElecInteraction))
749                         {
750                             if constexpr (elecInteractionTypeIsEwald)
751                             {
752                                 vCoul[i] = ewaldPotential(qq[i], rInvC, sh_ewald);
753                                 if constexpr (computeForces)
754                                 {
755                                     fScalC[i] = ewaldScalarForce(qq[i], rInvC);
756                                 }
757
758                                 if constexpr (softcoreType == SoftcoreType::Gapsys)
759                                 {
760                                     ewaldQuadraticPotential(qq[i],
761                                                             facel,
762                                                             rC,
763                                                             rCutoffCoul,
764                                                             LFC[i],
765                                                             DLF[i],
766                                                             alphaCoulEff,
767                                                             sh_ewald,
768                                                             &fScalC[i],
769                                                             &vCoul[i],
770                                                             &dvdlCoul,
771                                                             computeElecInteraction);
772                                 }
773                             }
774                             else
775                             {
776                                 vCoul[i] = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
777                                 if constexpr (computeForces)
778                                 {
779                                     fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
780                                 }
781
782                                 if constexpr (softcoreType == SoftcoreType::Gapsys)
783                                 {
784                                     reactionFieldQuadraticPotential(qq[i],
785                                                                     facel,
786                                                                     rC,
787                                                                     rCutoffCoul,
788                                                                     LFC[i],
789                                                                     DLF[i],
790                                                                     alphaCoulEff,
791                                                                     krf,
792                                                                     crf,
793                                                                     &fScalC[i],
794                                                                     &vCoul[i],
795                                                                     &dvdlCoul,
796                                                                     computeElecInteraction);
797                                 }
798                             }
799
800                             vCoul[i] = gmx::selectByMask(vCoul[i], computeElecInteraction);
801                             if constexpr (computeForces)
802                             {
803                                 fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
804                             }
805                         }
806
807                         /* Only process the VDW interactions if we either
808                          * include all entries in the list (no cutoff used
809                          * in the kernel), or if we are within the cutoff.
810                          */
811                         BoolType computeVdwInteraction;
812                         if constexpr (vdwInteractionTypeIsEwald)
813                         {
814                             computeVdwInteraction =
815                                     (r < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
816                         }
817                         else
818                         {
819                             computeVdwInteraction =
820                                     (rV < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
821                         }
822                         if (gmx::anyTrue(computeVdwInteraction))
823                         {
824                             RealType rInv6;
825                             if constexpr (softcoreType == SoftcoreType::Beutler)
826                             {
827                                 rInv6 = rPInvV;
828                             }
829                             else
830                             {
831                                 rInv6 = calculateRinv6(rInvV);
832                             }
833                             // Avoid overflow at short distance for masked exclusions and
834                             // for foreign energy calculations at a hard core end state.
835                             // Note that we should limit r^-6, and thus also r^-12, and
836                             // not only r^-12, as that could lead to erroneously low instead
837                             // of very high foreign energies.
838                             rInv6           = gmx::min(rInv6, maxRInvSix);
839                             RealType vVdw6  = calculateVdw6(c6[i], rInv6);
840                             RealType vVdw12 = calculateVdw12(c12[i], rInv6);
841
842                             vVdw[i] = lennardJonesPotential(
843                                     vVdw6, vVdw12, c6[i], c12[i], repulsionShift, dispersionShift, oneSixth, oneTwelfth);
844                             if constexpr (computeForces)
845                             {
846                                 fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
847                             }
848
849                             if constexpr (softcoreType == SoftcoreType::Gapsys)
850                             {
851                                 lennardJonesQuadraticPotential(c6[i],
852                                                                c12[i],
853                                                                r,
854                                                                rSq,
855                                                                LFV[i],
856                                                                DLF[i],
857                                                                sigma6[i],
858                                                                alphaVdwEff,
859                                                                repulsionShift,
860                                                                dispersionShift,
861                                                                &fScalV[i],
862                                                                &vVdw[i],
863                                                                &dvdlVdw,
864                                                                computeVdwInteraction);
865                             }
866
867                             if constexpr (vdwInteractionTypeIsEwald)
868                             {
869                                 /* Subtract the grid potential at the cut-off */
870                                 vVdw[i] = vVdw[i]
871                                           + gmx::selectByMask(ewaldLennardJonesGridSubtract(
872                                                                       ljPmeC6Grid[i], shLjEwald, oneSixth),
873                                                               computeVdwInteraction);
874                             }
875
876                             if constexpr (vdwModifierIsPotSwitch)
877                             {
878                                 RealType d             = rV - rVdwSwitch;
879                                 BoolType zeroMask      = zero < d;
880                                 BoolType potSwitchMask = rV < rVdw;
881                                 d                      = gmx::selectByMask(d, zeroMask);
882                                 const RealType d2      = d * d;
883                                 const RealType sw =
884                                         one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
885
886                                 if constexpr (computeForces)
887                                 {
888                                     const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
889                                     fScalV[i]          = potSwitchScalarForceMod(
890                                             fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
891                                 }
892                                 vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, potSwitchMask);
893                             }
894
895                             vVdw[i] = gmx::selectByMask(vVdw[i], computeVdwInteraction);
896                             if constexpr (computeForces)
897                             {
898                                 fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
899                             }
900                         }
901
902                         if constexpr (computeForces)
903                         {
904                             /* fScalC (and fScalV) now contain: dV/drC * rC
905                              * Now we multiply by rC^-6, so it will be: dV/drC * rC^-5
906                              * Further down we first multiply by r^4 and then by
907                              * the vector r, which in total gives: dV/drC * (r/rC)^-5
908                              */
909                             fScalC[i] = fScalC[i] * rPInvC;
910                             fScalV[i] = fScalV[i] * rPInvV;
911                         }
912                     } // end of block requiring nonZeroState
913                 }     // end for (int i = 0; i < NSTATES; i++)
914
915                 /* Assemble A and B states. */
916                 BoolType assembleStates = (bPairIncluded && withinCutoffMask);
917                 if (gmx::anyTrue(assembleStates))
918                 {
919                     for (int i = 0; i < NSTATES; i++)
920                     {
921                         vCTot = vCTot + LFC[i] * vCoul[i];
922                         vVTot = vVTot + LFV[i] * vVdw[i];
923
924                         if constexpr (computeForces)
925                         {
926                             fScal = fScal + LFC[i] * fScalC[i] * rpm2;
927                             fScal = fScal + LFV[i] * fScalV[i] * rpm2;
928                         }
929
930                         if constexpr (softcoreType == SoftcoreType::Beutler)
931                         {
932                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i]
933                                        + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i];
934                             dvdlVdw = dvdlVdw + vVdw[i] * DLF[i]
935                                       + LFV[i] * alphaVdwEff * dlFacVdw[i] * fScalV[i] * sigma6[i];
936                         }
937                         else
938                         {
939                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i];
940                             dvdlVdw  = dvdlVdw + vVdw[i] * DLF[i];
941                         }
942                     }
943                 }
944             } // end of block requiring bPairIncluded && withinCutoffMask
945             /* In the following block bPairIncluded should be false in the masks. */
946             if (icoul == NbkernelElecType::ReactionField)
947             {
948                 const BoolType computeReactionField = bPairExcluded;
949
950                 if (gmx::anyTrue(computeReactionField))
951                 {
952                     /* For excluded pairs we don't use soft-core.
953                      * As there is no singularity, there is no need for soft-core.
954                      */
955                     const RealType FF = -two * krf;
956                     RealType       VV = krf * rSq - crf;
957
958                     /* If ii == jnr the i particle (ii) has itself (jnr)
959                      * in its neighborlist. This corresponds to a self-interaction
960                      * that will occur twice. Scale it down by 50% to only include
961                      * it once.
962                      */
963                     VV = VV * gmx::blend(one, half, bIiEqJnr);
964
965                     for (int i = 0; i < NSTATES; i++)
966                     {
967                         vCTot = vCTot + gmx::selectByMask(LFC[i] * qq[i] * VV, computeReactionField);
968                         fScal = fScal + gmx::selectByMask(LFC[i] * qq[i] * FF, computeReactionField);
969                         dvdlCoul = dvdlCoul + gmx::selectByMask(DLF[i] * qq[i] * VV, computeReactionField);
970                     }
971                 }
972             }
973
974             const BoolType computeElecEwaldInteraction = (bPairExcluded || r < rCoulomb);
975             if (elecInteractionTypeIsEwald && gmx::anyTrue(computeElecEwaldInteraction))
976             {
977                 /* See comment in the preamble. When using Ewald interactions
978                  * (unless we use a switch modifier) we subtract the reciprocal-space
979                  * Ewald component here which made it possible to apply the free
980                  * energy interaction to 1/r (vanilla coulomb short-range part)
981                  * above. This gets us closer to the ideal case of applying
982                  * the softcore to the entire electrostatic interaction,
983                  * including the reciprocal-space component.
984                  */
985                 RealType v_lr, f_lr;
986
987                 pmeCoulombCorrectionVF<computeForces>(rSq, ewaldBeta, &v_lr, &f_lr);
988                 if constexpr (computeForces)
989                 {
990                     f_lr = f_lr * rInv * rInv;
991                 }
992
993                 /* Note that any possible Ewald shift has already been applied in
994                  * the normal interaction part above.
995                  */
996
997                 /* If ii == jnr the i particle (ii) has itself (jnr)
998                  * in its neighborlist. This corresponds to a self-interaction
999                  * that will occur twice. Scale it down by 50% to only include
1000                  * it once.
1001                  */
1002                 v_lr = v_lr * gmx::blend(one, half, bIiEqJnr);
1003
1004                 for (int i = 0; i < NSTATES; i++)
1005                 {
1006                     vCTot = vCTot - gmx::selectByMask(LFC[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1007                     if constexpr (computeForces)
1008                     {
1009                         fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
1010                     }
1011                     dvdlCoul = dvdlCoul
1012                                - gmx::selectByMask(DLF[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1013                 }
1014             }
1015
1016             const BoolType computeVdwEwaldInteraction = (bPairExcluded || r < rVdw);
1017             if (vdwInteractionTypeIsEwald && gmx::anyTrue(computeVdwEwaldInteraction))
1018             {
1019                 /* See comment in the preamble. When using LJ-Ewald interactions
1020                  * (unless we use a switch modifier) we subtract the reciprocal-space
1021                  * Ewald component here which made it possible to apply the free
1022                  * energy interaction to r^-6 (vanilla LJ6 short-range part)
1023                  * above. This gets us closer to the ideal case of applying
1024                  * the softcore to the entire VdW interaction,
1025                  * including the reciprocal-space component.
1026                  */
1027
1028                 RealType v_lr, f_lr;
1029                 pmeLJCorrectionVF<computeForces>(
1030                         rInv, rSq, ewaldLJCoeffSq, ewaldLJCoeffSixDivSix, &v_lr, &f_lr, computeVdwEwaldInteraction, bIiEqJnr);
1031                 v_lr = v_lr * oneSixth;
1032
1033                 for (int i = 0; i < NSTATES; i++)
1034                 {
1035                     vVTot = vVTot + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1036                     if constexpr (computeForces)
1037                     {
1038                         fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
1039                     }
1040                     dvdlVdw = dvdlVdw + gmx::selectByMask(DLF[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1041                 }
1042             }
1043
1044             if (computeForces && gmx::anyTrue(fScal != zero))
1045             {
1046                 const RealType tX = fScal * dX;
1047                 const RealType tY = fScal * dY;
1048                 const RealType tZ = fScal * dZ;
1049                 fIX               = fIX + tX;
1050                 fIY               = fIY + tY;
1051                 fIZ               = fIZ + tZ;
1052
1053                 gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ);
1054             }
1055         } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
1056
1057         if (havePairsWithinCutoff)
1058         {
1059             if constexpr (computeForces)
1060             {
1061                 gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ);
1062
1063                 if (doShiftForces)
1064                 {
1065                     gmx::transposeScatterIncrU<3>(
1066                             reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
1067                 }
1068             }
1069             if (doPotential)
1070             {
1071                 int ggid = gid[n];
1072                 threadVc[ggid] += gmx::reduce(vCTot);
1073                 threadVv[ggid] += gmx::reduce(vVTot);
1074             }
1075         }
1076     } // end for (int n = 0; n < nri; n++)
1077
1078     if (gmx::anyTrue(dvdlCoul != zero))
1079     {
1080         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)] += gmx::reduce(dvdlCoul);
1081     }
1082     if (gmx::anyTrue(dvdlVdw != zero))
1083     {
1084         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)] += gmx::reduce(dvdlVdw);
1085     }
1086
1087     /* Estimate flops, average for free energy stuff:
1088      * 12  flops per outer iteration
1089      * 150 flops per inner iteration
1090      * TODO: Update the number of flops and/or use different counts for different code paths.
1091      */
1092     atomicNrnbIncrement(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist.nri * 12 + nlist.jindex[nri] * 150);
1093
1094     if (haveExcludedPairsBeyondRlist > 0)
1095     {
1096         gmx_fatal(FARGS,
1097                   "There are perturbed non-bonded pair interactions beyond the pair-list cutoff "
1098                   "of %g nm, which is not supported. This can happen because the system is "
1099                   "unstable or because intra-molecular interactions at long distances are "
1100                   "excluded. If the "
1101                   "latter is the case, you can try to increase nstlist or rlist to avoid this."
1102                   "The error is likely triggered by the use of couple-intramol=no "
1103                   "and the maximal distance in the decoupled molecule exceeding rlist.",
1104                   rlist);
1105     }
1106 }
1107
1108 typedef void (*KernelFunction)(const t_nblist&                                  nlist,
1109                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1110                                const int                                        ntype,
1111                                const real                                       rlist,
1112                                const interaction_const_t&                       ic,
1113                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1114                                gmx::ArrayRef<const real>                        nbfp,
1115                                gmx::ArrayRef<const real>                        nbfp_grid,
1116                                gmx::ArrayRef<const real>                        chargeA,
1117                                gmx::ArrayRef<const real>                        chargeB,
1118                                gmx::ArrayRef<const int>                         typeA,
1119                                gmx::ArrayRef<const int>                         typeB,
1120                                int                                              flags,
1121                                gmx::ArrayRef<const real>                        lambda,
1122                                t_nrnb* gmx_restrict                             nrnb,
1123                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1124                                rvec*               threadForceShiftBuffer,
1125                                gmx::ArrayRef<real> threadVc,
1126                                gmx::ArrayRef<real> threadVv,
1127                                gmx::ArrayRef<real> threadDvdl);
1128
1129 template<SoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
1130 static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
1131 {
1132     if (useSimd)
1133     {
1134 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS
1135         return (nb_free_energy_kernel<SimdDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1136 #else
1137         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch computeForces>);
1138 #endif
1139     }
1140     else
1141     {
1142         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1143     }
1144 }
1145
1146 template<SoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
1147 static KernelFunction dispatchKernelOnComputeForces(const bool computeForces, const bool useSimd)
1148 {
1149     if (computeForces)
1150     {
1151         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, true>(
1152                 useSimd));
1153     }
1154     else
1155     {
1156         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, false>(
1157                 useSimd));
1158     }
1159 }
1160
1161 template<SoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
1162 static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch,
1163                                                   const bool computeForces,
1164                                                   const bool useSimd)
1165 {
1166     if (vdwModifierIsPotSwitch)
1167     {
1168         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
1169                 computeForces, useSimd));
1170     }
1171     else
1172     {
1173         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
1174                 computeForces, useSimd));
1175     }
1176 }
1177
1178 template<SoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
1179 static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
1180                                                           const bool vdwModifierIsPotSwitch,
1181                                                           const bool computeForces,
1182                                                           const bool useSimd)
1183 {
1184     if (elecInteractionTypeIsEwald)
1185     {
1186         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
1187                 vdwModifierIsPotSwitch, computeForces, useSimd));
1188     }
1189     else
1190     {
1191         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
1192                 vdwModifierIsPotSwitch, computeForces, useSimd));
1193     }
1194 }
1195
1196 template<SoftcoreType softcoreType, bool scLambdasOrAlphasDiffer>
1197 static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
1198                                                          const bool elecInteractionTypeIsEwald,
1199                                                          const bool vdwModifierIsPotSwitch,
1200                                                          const bool computeForces,
1201                                                          const bool useSimd)
1202 {
1203     if (vdwInteractionTypeIsEwald)
1204     {
1205         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, true>(
1206                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1207     }
1208     else
1209     {
1210         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, false>(
1211                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1212     }
1213 }
1214
1215 template<SoftcoreType softcoreType>
1216 static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
1217                                                                   const bool vdwInteractionTypeIsEwald,
1218                                                                   const bool elecInteractionTypeIsEwald,
1219                                                                   const bool vdwModifierIsPotSwitch,
1220                                                                   const bool computeForces,
1221                                                                   const bool useSimd)
1222 {
1223     if (scLambdasOrAlphasDiffer)
1224     {
1225         return (dispatchKernelOnVdwInteractionType<softcoreType, true>(
1226                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1227     }
1228     else
1229     {
1230         return (dispatchKernelOnVdwInteractionType<softcoreType, false>(
1231                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1232     }
1233 }
1234
1235 static KernelFunction dispatchKernel(const bool                 scLambdasOrAlphasDiffer,
1236                                      const bool                 vdwInteractionTypeIsEwald,
1237                                      const bool                 elecInteractionTypeIsEwald,
1238                                      const bool                 vdwModifierIsPotSwitch,
1239                                      const bool                 computeForces,
1240                                      const bool                 useSimd,
1241                                      const interaction_const_t& ic)
1242 {
1243     if ((ic.softCoreParameters->alphaCoulomb == 0 && ic.softCoreParameters->alphaVdw == 0)
1244         || ic.softCoreParameters->softcoreType == SoftcoreType::None)
1245     {
1246         return (dispatchKernelOnScLambdasOrAlphasDifference<SoftcoreType::None>(scLambdasOrAlphasDiffer,
1247                                                                                 vdwInteractionTypeIsEwald,
1248                                                                                 elecInteractionTypeIsEwald,
1249                                                                                 vdwModifierIsPotSwitch,
1250                                                                                 computeForces,
1251                                                                                 useSimd));
1252     }
1253     else
1254     {
1255         if (ic.softCoreParameters->softcoreType == SoftcoreType::Beutler)
1256         {
1257             return (dispatchKernelOnScLambdasOrAlphasDifference<SoftcoreType::Beutler>(
1258                     scLambdasOrAlphasDiffer,
1259                     vdwInteractionTypeIsEwald,
1260                     elecInteractionTypeIsEwald,
1261                     vdwModifierIsPotSwitch,
1262                     computeForces,
1263                     useSimd));
1264         }
1265         else
1266         {
1267             return (dispatchKernelOnScLambdasOrAlphasDifference<SoftcoreType::Gapsys>(
1268                     scLambdasOrAlphasDiffer,
1269                     vdwInteractionTypeIsEwald,
1270                     elecInteractionTypeIsEwald,
1271                     vdwModifierIsPotSwitch,
1272                     computeForces,
1273                     useSimd));
1274         }
1275     }
1276 }
1277
1278
1279 void gmx_nb_free_energy_kernel(const t_nblist&                                  nlist,
1280                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1281                                const bool                                       useSimd,
1282                                const int                                        ntype,
1283                                const real                                       rlist,
1284                                const interaction_const_t&                       ic,
1285                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1286                                gmx::ArrayRef<const real>                        nbfp,
1287                                gmx::ArrayRef<const real>                        nbfp_grid,
1288                                gmx::ArrayRef<const real>                        chargeA,
1289                                gmx::ArrayRef<const real>                        chargeB,
1290                                gmx::ArrayRef<const int>                         typeA,
1291                                gmx::ArrayRef<const int>                         typeB,
1292                                int                                              flags,
1293                                gmx::ArrayRef<const real>                        lambda,
1294                                t_nrnb*                                          nrnb,
1295                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1296                                rvec*               threadForceShiftBuffer,
1297                                gmx::ArrayRef<real> threadVc,
1298                                gmx::ArrayRef<real> threadVv,
1299                                gmx::ArrayRef<real> threadDvdl)
1300 {
1301     GMX_ASSERT(EEL_PME_EWALD(ic.eeltype) || ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype),
1302                "Unsupported eeltype with free energy");
1303     GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters");
1304
1305     // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert
1306     GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty()
1307                        || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(),
1308                "We need actual padding with at least one element for SIMD scatter operations");
1309
1310     const auto& scParams                   = *ic.softCoreParameters;
1311     const bool  vdwInteractionTypeIsEwald  = (EVDW_PME(ic.vdwtype));
1312     const bool  elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype));
1313     const bool  vdwModifierIsPotSwitch     = (ic.vdw_modifier == InteractionModifiers::PotSwitch);
1314     const bool  computeForces              = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
1315     bool        scLambdasOrAlphasDiffer    = true;
1316
1317     if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
1318     {
1319         scLambdasOrAlphasDiffer = false;
1320     }
1321     else
1322     {
1323         if (lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
1324                     == lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]
1325             && scParams.alphaCoulomb == scParams.alphaVdw)
1326         {
1327             scLambdasOrAlphasDiffer = false;
1328         }
1329     }
1330
1331     KernelFunction kernelFunc;
1332     kernelFunc = dispatchKernel(scLambdasOrAlphasDiffer,
1333                                 vdwInteractionTypeIsEwald,
1334                                 elecInteractionTypeIsEwald,
1335                                 vdwModifierIsPotSwitch,
1336                                 computeForces,
1337                                 useSimd,
1338                                 ic);
1339     kernelFunc(nlist,
1340                coords,
1341                ntype,
1342                rlist,
1343                ic,
1344                shiftvec,
1345                nbfp,
1346                nbfp_grid,
1347                chargeA,
1348                chargeB,
1349                typeA,
1350                typeB,
1351                flags,
1352                lambda,
1353                nrnb,
1354                threadForceBuffer,
1355                threadForceShiftBuffer,
1356                threadVc,
1357                threadVv,
1358                threadDvdl);
1359 }