src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "nb_free_energy.h"
  41
  42 #include "config.h"
  43
  44 #include <cmath>
  45 #include <set>
  46
  47 #include <algorithm>
  48
  49 #include "gromacs/gmxlib/nrnb.h"
  50 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
  51 #include "gromacs/math/arrayrefwithpadding.h"
  52 #include "gromacs/math/functions.h"
  53 #include "gromacs/math/vec.h"
  54 #include "gromacs/mdtypes/forceoutput.h"
  55 #include "gromacs/mdtypes/forcerec.h"
  56 #include "gromacs/mdtypes/interaction_const.h"
  57 #include "gromacs/mdtypes/md_enums.h"
  58 #include "gromacs/mdtypes/mdatom.h"
  59 #include "gromacs/mdtypes/nblist.h"
  60 #include "gromacs/pbcutil/ishift.h"
  61 #include "gromacs/simd/simd.h"
  62 #include "gromacs/simd/simd_math.h"
  63 #include "gromacs/utility/fatalerror.h"
  64 #include "gromacs/utility/arrayref.h"
  65
  66 #include "nb_softcore.h"
  67
  68 //! Scalar (non-SIMD) data types.
  69 struct ScalarDataTypes
  70 {
  71     using RealType = real; //!< The data type to use as real.
  72     using IntType  = int;  //!< The data type to use as int.
  73     using BoolType = bool; //!< The data type to use as bool for real value comparison.
  74     static constexpr int simdRealWidth = 1; //!< The width of the RealType.
  75     static constexpr int simdIntWidth  = 1; //!< The width of the IntType.
  76 };
  77
  78 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS
  79 //! SIMD data types.
  80 struct SimdDataTypes
  81 {
  82     using RealType = gmx::SimdReal;  //!< The data type to use as real.
  83     using IntType  = gmx::SimdInt32; //!< The data type to use as int.
  84     using BoolType = gmx::SimdBool;  //!< The data type to use as bool for real value comparison.
  85     static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH; //!< The width of the RealType.
  86 #    if GMX_SIMD_HAVE_DOUBLE && GMX_DOUBLE
  87     static constexpr int simdIntWidth = GMX_SIMD_DINT32_WIDTH; //!< The width of the IntType.
  88 #    else
  89     static constexpr int simdIntWidth = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType.
  90 #    endif
  91 };
  92 #endif
  93
  94 /*! \brief Lower limit for square interaction distances in nonbonded kernels.
  95  *
  96  * This is a mimimum on r^2 to avoid overflows when computing r^6.
  97  * This will only affect results for soft-cored interaction at distances smaller
  98  * than 1e-6 and will limit extremely high foreign energies for overlapping atoms.
  99  * Note that we could use a somewhat smaller minimum in double precision.
 100  * But because invsqrt in double precision can use single precision, this number
 101  * can not be much smaller, we use the same number for simplicity.
 102  */
 103 constexpr real c_minDistanceSquared = 1.0e-12_real;
 104
 105 /*! \brief Higher limit for r^-6 used for Lennard-Jones interactions
 106  *
 107  * This is needed to avoid overflow of LJ energy and force terms for excluded
 108  * atoms and foreign energies of hard-core states of overlapping atoms.
 109  * Note that in single precision this value leaves room for C12 coefficients up to 3.4e8.
 110  */
 111 constexpr real c_maxRInvSix = 1.0e15_real;
 112
 113 template<bool computeForces, class RealType>
 114 static inline void
 115 pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType gmx_unused* force)
 116 {
 117     const RealType brsq = rSq * beta * beta;
 118     if constexpr (computeForces)
 119     {
 120         *force = -brsq * beta * gmx::pmeForceCorrection(brsq);
 121     }
 122     *pot = beta * gmx::pmePotentialCorrection(brsq);
 123 }
 124
 125 template<bool computeForces, class RealType, class BoolType>
 126 static inline void pmeLJCorrectionVF(const RealType rInv,
 127                                      const RealType rSq,
 128                                      const real     ewaldLJCoeffSq,
 129                                      const real     ewaldLJCoeffSixDivSix,
 130                                      RealType*      pot,
 131                                      RealType gmx_unused* force,
 132                                      const BoolType       mask,
 133                                      const BoolType       bIiEqJnr)
 134 {
 135     // We mask rInv to get zero force and potential for masked out pair interactions
 136     const RealType rInvSq  = rInv * rInv;
 137     const RealType rInvSix = rInvSq * rInvSq * rInvSq;
 138     // Mask rSq to avoid underflow in exp()
 139     const RealType coeffSqRSq       = ewaldLJCoeffSq * gmx::selectByMask(rSq, mask);
 140     const RealType expNegCoeffSqRSq = gmx::exp(-coeffSqRSq);
 141     const RealType poly             = 1.0_real + coeffSqRSq + 0.5_real * coeffSqRSq * coeffSqRSq;
 142     if constexpr (computeForces)
 143     {
 144         *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
 145         *force = *force * rInvSq;
 146     }
 147     // The self interaction is the limit for r -> 0 which we need to compute separately
 148     *pot = gmx::blend(
 149             rInvSix * (1.0_real - expNegCoeffSqRSq * poly), 0.5_real * ewaldLJCoeffSixDivSix, bIiEqJnr);
 150 }
 151
 152 //! Computes r^(1/6) and 1/r^(1/6)
 153 template<class RealType>
 154 static inline void sixthRoot(const RealType r, RealType* sixthRoot, RealType* invSixthRoot)
 155 {
 156     RealType cbrtRes = gmx::cbrt(r);
 157     *invSixthRoot    = gmx::invsqrt(cbrtRes);
 158     *sixthRoot       = gmx::inv(*invSixthRoot);
 159 }
 160
 161 template<class RealType>
 162 static inline RealType calculateRinv6(const RealType rInvV)
 163 {
 164     RealType rInv6 = rInvV * rInvV;
 165     return (rInv6 * rInv6 * rInv6);
 166 }
 167
 168 template<class RealType>
 169 static inline RealType calculateVdw6(const RealType c6, const RealType rInv6)
 170 {
 171     return (c6 * rInv6);
 172 }
 173
 174 template<class RealType>
 175 static inline RealType calculateVdw12(const RealType c12, const RealType rInv6)
 176 {
 177     return (c12 * rInv6 * rInv6);
 178 }
 179
 180 /* reaction-field electrostatics */
 181 template<class RealType>
 182 static inline RealType reactionFieldScalarForce(const RealType qq,
 183                                                 const RealType rInv,
 184                                                 const RealType r,
 185                                                 const real     krf,
 186                                                 const real     two)
 187 {
 188     return (qq * (rInv - two * krf * r * r));
 189 }
 190 template<class RealType>
 191 static inline RealType reactionFieldPotential(const RealType qq,
 192                                               const RealType rInv,
 193                                               const RealType r,
 194                                               const real     krf,
 195                                               const real     potentialShift)
 196 {
 197     return (qq * (rInv + krf * r * r - potentialShift));
 198 }
 199
 200 /* Ewald electrostatics */
 201 template<class RealType>
 202 static inline RealType ewaldScalarForce(const RealType coulomb, const RealType rInv)
 203 {
 204     return (coulomb * rInv);
 205 }
 206 template<class RealType>
 207 static inline RealType ewaldPotential(const RealType coulomb, const RealType rInv, const real potentialShift)
 208 {
 209     return (coulomb * (rInv - potentialShift));
 210 }
 211
 212 /* cutoff LJ */
 213 template<class RealType>
 214 static inline RealType lennardJonesScalarForce(const RealType v6, const RealType v12)
 215 {
 216     return (v12 - v6);
 217 }
 218 template<class RealType>
 219 static inline RealType lennardJonesPotential(const RealType v6,
 220                                              const RealType v12,
 221                                              const RealType c6,
 222                                              const RealType c12,
 223                                              const real     repulsionShift,
 224                                              const real     dispersionShift,
 225                                              const real     oneSixth,
 226                                              const real     oneTwelfth)
 227 {
 228     return ((v12 + c12 * repulsionShift) * oneTwelfth - (v6 + c6 * dispersionShift) * oneSixth);
 229 }
 230
 231 /* Ewald LJ */
 232 template<class RealType>
 233 static inline RealType ewaldLennardJonesGridSubtract(const RealType c6grid,
 234                                                      const real     potentialShift,
 235                                                      const real     oneSixth)
 236 {
 237     return (c6grid * potentialShift * oneSixth);
 238 }
 239
 240 /* LJ Potential switch */
 241 template<class RealType, class BoolType>
 242 static inline RealType potSwitchScalarForceMod(const RealType fScalarInp,
 243                                                const RealType potential,
 244                                                const RealType sw,
 245                                                const RealType r,
 246                                                const RealType dsw,
 247                                                const BoolType mask)
 248 {
 249     /* The mask should select on rV < rVdw */
 250     return (gmx::selectByMask(fScalarInp * sw - r * potential * dsw, mask));
 251 }
 252 template<class RealType, class BoolType>
 253 static inline RealType potSwitchPotentialMod(const RealType potentialInp, const RealType sw, const BoolType mask)
 254 {
 255     /* The mask should select on rV < rVdw */
 256     return (gmx::selectByMask(potentialInp * sw, mask));
 257 }
 258
 259
 260 //! Templated free-energy non-bonded kernel
 261 template<typename DataTypes, KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
 262 static void nb_free_energy_kernel(const t_nblist&                                  nlist,
 263                                   const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
 264                                   const int                                        ntype,
 265                                   const real                                       rlist,
 266                                   const interaction_const_t&                       ic,
 267                                   gmx::ArrayRef<const gmx::RVec>                   shiftvec,
 268                                   gmx::ArrayRef<const real>                        nbfp,
 269                                   gmx::ArrayRef<const real> gmx_unused             nbfp_grid,
 270                                   gmx::ArrayRef<const real>                        chargeA,
 271                                   gmx::ArrayRef<const real>                        chargeB,
 272                                   gmx::ArrayRef<const int>                         typeA,
 273                                   gmx::ArrayRef<const int>                         typeB,
 274                                   int                                              flags,
 275                                   gmx::ArrayRef<const real>                        lambda,
 276                                   t_nrnb* gmx_restrict                             nrnb,
 277                                   gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
 278                                   rvec gmx_unused*    threadForceShiftBuffer,
 279                                   gmx::ArrayRef<real> threadVc,
 280                                   gmx::ArrayRef<real> threadVv,
 281                                   gmx::ArrayRef<real> threadDvdl)
 282 {
 283 #define STATE_A 0
 284 #define STATE_B 1
 285 #define NSTATES 2
 286
 287     using RealType = typename DataTypes::RealType;
 288     using IntType  = typename DataTypes::IntType;
 289     using BoolType = typename DataTypes::BoolType;
 290
 291     constexpr real oneTwelfth = 1.0_real / 12.0_real;
 292     constexpr real oneSixth   = 1.0_real / 6.0_real;
 293     constexpr real zero       = 0.0_real;
 294     constexpr real half       = 0.5_real;
 295     constexpr real one        = 1.0_real;
 296     constexpr real two        = 2.0_real;
 297     constexpr real six        = 6.0_real;
 298
 299     // Extract pair list data
 300     const int                nri    = nlist.nri;
 301     gmx::ArrayRef<const int> iinr   = nlist.iinr;
 302     gmx::ArrayRef<const int> jindex = nlist.jindex;
 303     gmx::ArrayRef<const int> jjnr   = nlist.jjnr;
 304     gmx::ArrayRef<const int> shift  = nlist.shift;
 305     gmx::ArrayRef<const int> gid    = nlist.gid;
 306
 307     const real lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
 308     const real lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
 309
 310     // Extract softcore parameters
 311     const auto&           scParams   = *ic.softCoreParameters;
 312     const real            lam_power  = scParams.lambdaPower;
 313     const real gmx_unused alpha_coul = scParams.alphaCoulomb;
 314     const real gmx_unused alpha_vdw  = scParams.alphaVdw;
 315     const real gmx_unused sigma6_def = scParams.sigma6WithInvalidSigma;
 316     const real gmx_unused sigma6_min = scParams.sigma6Minimum;
 317
 318     const real gmx_unused gapsysScaleLinpointCoul = scParams.gapsysScaleLinpointCoul;
 319     const real gmx_unused gapsysScaleLinpointVdW  = scParams.gapsysScaleLinpointVdW;
 320     const real gmx_unused gapsysSigma6VdW         = scParams.gapsysSigma6VdW;
 321
 322     const bool gmx_unused doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
 323     const bool            doPotential   = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0);
 324
 325     // Extract data from interaction_const_t
 326     const real            facel           = ic.epsfac;
 327     const real            rCoulomb        = ic.rcoulomb;
 328     const real            krf             = ic.reactionFieldCoefficient;
 329     const real            crf             = ic.reactionFieldShift;
 330     const real gmx_unused shLjEwald       = ic.sh_lj_ewald;
 331     const real            rVdw            = ic.rvdw;
 332     const real            dispersionShift = ic.dispersion_shift.cpot;
 333     const real            repulsionShift  = ic.repulsion_shift.cpot;
 334     const real            ewaldBeta       = ic.ewaldcoeff_q;
 335     real gmx_unused       ewaldLJCoeffSq;
 336     real gmx_unused       ewaldLJCoeffSixDivSix;
 337     if constexpr (vdwInteractionTypeIsEwald)
 338     {
 339         ewaldLJCoeffSq        = ic.ewaldcoeff_lj * ic.ewaldcoeff_lj;
 340         ewaldLJCoeffSixDivSix = ewaldLJCoeffSq * ewaldLJCoeffSq * ewaldLJCoeffSq / six;
 341     }
 342
 343     // Note that the nbnxm kernels do not support Coulomb potential switching at all
 344     GMX_ASSERT(ic.coulomb_modifier != InteractionModifiers::PotSwitch,
 345                "Potential switching is not supported for Coulomb with FEP");
 346
 347     const real      rVdwSwitch = ic.rvdw_switch;
 348     real gmx_unused vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
 349     if constexpr (vdwModifierIsPotSwitch)
 350     {
 351         const real d = rVdw - rVdwSwitch;
 352         vdw_swV3     = -10.0_real / (d * d * d);
 353         vdw_swV4     = 15.0_real / (d * d * d * d);
 354         vdw_swV5     = -6.0_real / (d * d * d * d * d);
 355         vdw_swF2     = -30.0_real / (d * d * d);
 356         vdw_swF3     = 60.0_real / (d * d * d * d);
 357         vdw_swF4     = -30.0_real / (d * d * d * d * d);
 358     }
 359     else
 360     {
 361         /* Avoid warnings from stupid compilers (looking at you, Clang!) */
 362         vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = zero;
 363     }
 364
 365     NbkernelElecType icoul;
 366     if (ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype))
 367     {
 368         icoul = NbkernelElecType::ReactionField;
 369     }
 370     else
 371     {
 372         icoul = NbkernelElecType::None;
 373     }
 374
 375     real rcutoff_max2                 = std::max(ic.rcoulomb, ic.rvdw);
 376     rcutoff_max2                      = rcutoff_max2 * rcutoff_max2;
 377     const real gmx_unused rCutoffCoul = ic.rcoulomb;
 378
 379     real gmx_unused sh_ewald = 0;
 380     if constexpr (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
 381     {
 382         sh_ewald = ic.sh_ewald;
 383     }
 384
 385     /* For Ewald/PME interactions we cannot easily apply the soft-core component to
 386      * reciprocal space. When we use non-switched Ewald interactions, we
 387      * assume the soft-coring does not significantly affect the grid contribution
 388      * and apply the soft-core only to the full 1/r (- shift) pair contribution.
 389      *
 390      * However, we cannot use this approach for switch-modified since we would then
 391      * effectively end up evaluating a significantly different interaction here compared to the
 392      * normal (non-free-energy) kernels, either by applying a cutoff at a different
 393      * position than what the user requested, or by switching different
 394      * things (1/r rather than short-range Ewald). For these settings, we just
 395      * use the traditional short-range Ewald interaction in that case.
 396      */
 397     GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
 398                        "Can not apply soft-core to switched Ewald potentials");
 399
 400     const RealType            minDistanceSquared(c_minDistanceSquared);
 401     const RealType            maxRInvSix(c_maxRInvSix);
 402     const RealType gmx_unused floatMin(GMX_FLOAT_MIN);
 403
 404     RealType dvdlCoul{ zero };
 405     RealType dvdlVdw{ zero };
 406
 407     /* Lambda factor for state A, 1-lambda*/
 408     real LFC[NSTATES], LFV[NSTATES];
 409     LFC[STATE_A] = one - lambda_coul;
 410     LFV[STATE_A] = one - lambda_vdw;
 411
 412     /* Lambda factor for state B, lambda*/
 413     LFC[STATE_B] = lambda_coul;
 414     LFV[STATE_B] = lambda_vdw;
 415
 416     /*derivative of the lambda factor for state A and B */
 417     real DLF[NSTATES];
 418     DLF[STATE_A] = -one;
 419     DLF[STATE_B] = one;
 420
 421     real gmx_unused lFacCoul[NSTATES], dlFacCoul[NSTATES], lFacVdw[NSTATES], dlFacVdw[NSTATES];
 422     constexpr real  sc_r_power = six;
 423     for (int i = 0; i < NSTATES; i++)
 424     {
 425         lFacCoul[i]  = (lam_power == 2 ? (1 - LFC[i]) * (1 - LFC[i]) : (1 - LFC[i]));
 426         dlFacCoul[i] = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFC[i]) : 1);
 427         lFacVdw[i]   = (lam_power == 2 ? (1 - LFV[i]) * (1 - LFV[i]) : (1 - LFV[i]));
 428         dlFacVdw[i]  = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
 429     }
 430
 431     // We need pointers to real for SIMD access
 432     const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
 433     real* gmx_restrict       forceRealPtr;
 434     if constexpr (computeForces)
 435     {
 436         GMX_ASSERT(nri == 0 || !threadForceBuffer.empty(), "need a valid threadForceBuffer");
 437
 438         forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0];
 439
 440         if (doShiftForces)
 441         {
 442             GMX_ASSERT(threadForceShiftBuffer != nullptr, "need a valid threadForceShiftBuffer");
 443         }
 444     }
 445
 446     const real rlistSquared = gmx::square(rlist);
 447
 448     bool haveExcludedPairsBeyondRlist = false;
 449
 450     for (int n = 0; n < nri; n++)
 451     {
 452         bool havePairsWithinCutoff = false;
 453
 454         const int  is   = shift[n];
 455         const real shX  = shiftvec[is][XX];
 456         const real shY  = shiftvec[is][YY];
 457         const real shZ  = shiftvec[is][ZZ];
 458         const int  nj0  = jindex[n];
 459         const int  nj1  = jindex[n + 1];
 460         const int  ii   = iinr[n];
 461         const int  ii3  = 3 * ii;
 462         const real ix   = shX + x[ii3 + 0];
 463         const real iy   = shY + x[ii3 + 1];
 464         const real iz   = shZ + x[ii3 + 2];
 465         const real iqA  = facel * chargeA[ii];
 466         const real iqB  = facel * chargeB[ii];
 467         const int  ntiA = ntype * typeA[ii];
 468         const int  ntiB = ntype * typeB[ii];
 469         RealType   vCTot(0);
 470         RealType   vVTot(0);
 471         RealType   fIX(0);
 472         RealType   fIY(0);
 473         RealType   fIZ(0);
 474
 475 #if GMX_SIMD_HAVE_REAL
 476         alignas(GMX_SIMD_ALIGNMENT) int            preloadIi[DataTypes::simdRealWidth];
 477         alignas(GMX_SIMD_ALIGNMENT) int gmx_unused preloadIs[DataTypes::simdRealWidth];
 478 #else
 479         int            preloadIi[DataTypes::simdRealWidth];
 480         int gmx_unused preloadIs[DataTypes::simdRealWidth];
 481 #endif
 482         for (int i = 0; i < DataTypes::simdRealWidth; i++)
 483         {
 484             preloadIi[i] = ii;
 485             preloadIs[i] = shift[n];
 486         }
 487         IntType ii_s = gmx::load<IntType>(preloadIi);
 488
 489         for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
 490         {
 491             RealType r, rInv;
 492
 493 #if GMX_SIMD_HAVE_REAL
 494             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIsValid[DataTypes::simdRealWidth];
 495             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIncluded[DataTypes::simdRealWidth];
 496             alignas(GMX_SIMD_ALIGNMENT) int32_t preloadJnr[DataTypes::simdRealWidth];
 497             alignas(GMX_SIMD_ALIGNMENT) int32_t typeIndices[NSTATES][DataTypes::simdRealWidth];
 498             alignas(GMX_SIMD_ALIGNMENT) real    preloadQq[NSTATES][DataTypes::simdRealWidth];
 499             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
 500             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
 501             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
 502             alignas(GMX_SIMD_ALIGNMENT)
 503                     real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
 504             alignas(GMX_SIMD_ALIGNMENT)
 505                     real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
 506             alignas(GMX_SIMD_ALIGNMENT)
 507                     real gmx_unused preloadGapsysSigma6VdW[NSTATES][DataTypes::simdRealWidth];
 508             alignas(GMX_SIMD_ALIGNMENT) real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
 509 #else
 510             real            preloadPairIsValid[DataTypes::simdRealWidth];
 511             real            preloadPairIncluded[DataTypes::simdRealWidth];
 512             int             preloadJnr[DataTypes::simdRealWidth];
 513             int             typeIndices[NSTATES][DataTypes::simdRealWidth];
 514             real            preloadQq[NSTATES][DataTypes::simdRealWidth];
 515             real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
 516             real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
 517             real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
 518             real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
 519             real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
 520             real gmx_unused preloadGapsysSigma6VdW[NSTATES][DataTypes::simdRealWidth];
 521             real            preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
 522 #endif
 523             for (int j = 0; j < DataTypes::simdRealWidth; j++)
 524             {
 525                 if (k + j < nj1)
 526                 {
 527                     preloadPairIsValid[j] = true;
 528                     /* Check if this pair on the exclusions list.*/
 529                     preloadPairIncluded[j]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + j]);
 530                     const int jnr           = jjnr[k + j];
 531                     preloadJnr[j]           = jnr;
 532                     typeIndices[STATE_A][j] = ntiA + typeA[jnr];
 533                     typeIndices[STATE_B][j] = ntiB + typeB[jnr];
 534                     preloadQq[STATE_A][j]   = iqA * chargeA[jnr];
 535                     preloadQq[STATE_B][j]   = iqB * chargeB[jnr];
 536
 537                     for (int i = 0; i < NSTATES; i++)
 538                     {
 539                         if constexpr (vdwInteractionTypeIsEwald)
 540                         {
 541                             preloadLjPmeC6Grid[i][j] = nbfp_grid[2 * typeIndices[i][j]];
 542                         }
 543                         else
 544                         {
 545                             preloadLjPmeC6Grid[i][j] = 0;
 546                         }
 547                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 548                         {
 549                             const real c6  = nbfp[2 * typeIndices[i][j]];
 550                             const real c12 = nbfp[2 * typeIndices[i][j] + 1];
 551                             if (c6 > 0 && c12 > 0)
 552                             {
 553                                 /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
 554                                 preloadSigma6[i][j] = 0.5_real * c12 / c6;
 555                                 if (preloadSigma6[i][j]
 556                                     < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */
 557                                 {
 558                                     preloadSigma6[i][j] = sigma6_min;
 559                                 }
 560                             }
 561                             else
 562                             {
 563                                 preloadSigma6[i][j] = sigma6_def;
 564                             }
 565                         }
 566                         if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 567                         {
 568                             const real c6  = nbfp[2 * typeIndices[i][j]];
 569                             const real c12 = nbfp[2 * typeIndices[i][j] + 1];
 570                             if (c6 > 0 && c12 > 0)
 571                             {
 572                                 /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
 573                                 preloadGapsysSigma6VdW[i][j] = 0.5_real * c12 / c6;
 574                             }
 575                             else
 576                             {
 577                                 preloadGapsysSigma6VdW[i][j] = gapsysSigma6VdW;
 578                             }
 579                         }
 580                     }
 581                     if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 582                     {
 583                         /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
 584                         const real c12A = nbfp[2 * typeIndices[STATE_A][j] + 1];
 585                         const real c12B = nbfp[2 * typeIndices[STATE_B][j] + 1];
 586                         if (c12A > 0 && c12B > 0)
 587                         {
 588                             preloadAlphaVdwEff[j]  = 0;
 589                             preloadAlphaCoulEff[j] = 0;
 590                         }
 591                         else
 592                         {
 593                             preloadAlphaVdwEff[j]  = alpha_vdw;
 594                             preloadAlphaCoulEff[j] = alpha_coul;
 595                         }
 596                     }
 597                     if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 598                     {
 599                         /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
 600                         const real c12A = nbfp[2 * typeIndices[STATE_A][j] + 1];
 601                         const real c12B = nbfp[2 * typeIndices[STATE_B][j] + 1];
 602                         if (c12A > 0 && c12B > 0)
 603                         {
 604                             preloadGapsysScaleLinpointVdW[j]  = 0;
 605                             preloadGapsysScaleLinpointCoul[j] = 0;
 606                         }
 607                         else
 608                         {
 609                             preloadGapsysScaleLinpointVdW[j]  = gapsysScaleLinpointVdW;
 610                             preloadGapsysScaleLinpointCoul[j] = gapsysScaleLinpointCoul;
 611                         }
 612                     }
 613                 }
 614                 else
 615                 {
 616                     preloadJnr[j]                     = jjnr[k];
 617                     preloadPairIsValid[j]             = false;
 618                     preloadPairIncluded[j]            = false;
 619                     preloadAlphaVdwEff[j]             = 0;
 620                     preloadAlphaCoulEff[j]            = 0;
 621                     preloadGapsysScaleLinpointVdW[j]  = 0;
 622                     preloadGapsysScaleLinpointCoul[j] = 0;
 623
 624                     typeIndices[STATE_A][j] = ntiA + typeA[jjnr[k]];
 625                     typeIndices[STATE_B][j] = ntiB + typeB[jjnr[k]];
 626                     for (int i = 0; i < NSTATES; i++)
 627                     {
 628                         preloadLjPmeC6Grid[i][j]     = 0;
 629                         preloadQq[i][j]              = 0;
 630                         preloadSigma6[i][j]          = 0;
 631                         preloadGapsysSigma6VdW[i][j] = 0;
 632                     }
 633                 }
 634             }
 635
 636             RealType jx, jy, jz;
 637             gmx::gatherLoadUTranspose<3>(reinterpret_cast<const real*>(x), preloadJnr, &jx, &jy, &jz);
 638
 639             const RealType pairIsValid   = gmx::load<RealType>(preloadPairIsValid);
 640             const RealType pairIncluded  = gmx::load<RealType>(preloadPairIncluded);
 641             const BoolType bPairIncluded = (pairIncluded != zero);
 642             const BoolType bPairExcluded = (pairIncluded == zero && pairIsValid != zero);
 643
 644             const RealType dX  = ix - jx;
 645             const RealType dY  = iy - jy;
 646             const RealType dZ  = iz - jz;
 647             RealType       rSq = dX * dX + dY * dY + dZ * dZ;
 648
 649             BoolType withinCutoffMask = (rSq < rcutoff_max2);
 650
 651             if (!gmx::anyTrue(withinCutoffMask || bPairExcluded))
 652             {
 653                 /* We save significant time by skipping all code below.
 654                  * Note that with soft-core interactions, the actual cut-off
 655                  * check might be different. But since the soft-core distance
 656                  * is always larger than r, checking on r here is safe.
 657                  * Exclusions outside the cutoff can not be skipped as
 658                  * when using Ewald: the reciprocal-space
 659                  * Ewald component still needs to be subtracted.
 660                  */
 661                 continue;
 662             }
 663             else
 664             {
 665                 havePairsWithinCutoff = true;
 666             }
 667
 668             if (gmx::anyTrue(rlistSquared < rSq && bPairExcluded))
 669             {
 670                 haveExcludedPairsBeyondRlist = true;
 671             }
 672
 673             const IntType  jnr_s    = gmx::load<IntType>(preloadJnr);
 674             const BoolType bIiEqJnr = gmx::cvtIB2B(ii_s == jnr_s);
 675
 676             RealType            c6[NSTATES];
 677             RealType            c12[NSTATES];
 678             RealType gmx_unused sigma6[NSTATES];
 679             RealType            qq[NSTATES];
 680             RealType gmx_unused ljPmeC6Grid[NSTATES];
 681             RealType gmx_unused alphaVdwEff;
 682             RealType gmx_unused alphaCoulEff;
 683             RealType gmx_unused gapsysScaleLinpointVdWEff;
 684             RealType gmx_unused gapsysScaleLinpointCoulEff;
 685             RealType gmx_unused gapsysSigma6VdWEff[NSTATES];
 686             for (int i = 0; i < NSTATES; i++)
 687             {
 688                 gmx::gatherLoadTranspose<2>(nbfp.data(), typeIndices[i], &c6[i], &c12[i]);
 689                 qq[i]          = gmx::load<RealType>(preloadQq[i]);
 690                 ljPmeC6Grid[i] = gmx::load<RealType>(preloadLjPmeC6Grid[i]);
 691                 if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 692                 {
 693                     sigma6[i] = gmx::load<RealType>(preloadSigma6[i]);
 694                 }
 695                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 696                 {
 697                     gapsysSigma6VdWEff[i] = gmx::load<RealType>(preloadGapsysSigma6VdW[i]);
 698                 }
 699             }
 700             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 701             {
 702                 alphaVdwEff  = gmx::load<RealType>(preloadAlphaVdwEff);
 703                 alphaCoulEff = gmx::load<RealType>(preloadAlphaCoulEff);
 704             }
 705             if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 706             {
 707                 gapsysScaleLinpointVdWEff  = gmx::load<RealType>(preloadGapsysScaleLinpointVdW);
 708                 gapsysScaleLinpointCoulEff = gmx::load<RealType>(preloadGapsysScaleLinpointCoul);
 709             }
 710
 711             // Avoid overflow of r^-12 at distances near zero
 712             rSq  = gmx::max(rSq, minDistanceSquared);
 713             rInv = gmx::invsqrt(rSq);
 714             r    = rSq * rInv;
 715
 716             RealType gmx_unused rp, rpm2;
 717             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 718             {
 719                 rpm2 = rSq * rSq;  /* r4 */
 720                 rp   = rpm2 * rSq; /* r6 */
 721             }
 722             else
 723             {
 724                 /* The soft-core power p will not affect the results
 725                  * with not using soft-core, so we use power of 0 which gives
 726                  * the simplest math and cheapest code.
 727                  */
 728                 rpm2 = rInv * rInv;
 729                 rp   = one;
 730             }
 731
 732             RealType fScal(0);
 733
 734             /* The following block is masked to only calculate values having bPairIncluded. If
 735              * bPairIncluded is true then withinCutoffMask must also be true. */
 736             if (gmx::anyTrue(withinCutoffMask && bPairIncluded))
 737             {
 738                 RealType gmx_unused fScalC[NSTATES], fScalV[NSTATES];
 739                 RealType            vCoul[NSTATES], vVdw[NSTATES];
 740                 for (int i = 0; i < NSTATES; i++)
 741                 {
 742                     fScalC[i] = zero;
 743                     fScalV[i] = zero;
 744                     vCoul[i]  = zero;
 745                     vVdw[i]   = zero;
 746
 747                     RealType gmx_unused rInvC, rInvV, rC, rV, rPInvC, rPInvV;
 748
 749                     /* The following block is masked to require (qq[i] != 0 || c6[i] != 0 || c12[i]
 750                      * != 0) in addition to bPairIncluded, which in turn requires withinCutoffMask. */
 751                     BoolType nonZeroState = ((qq[i] != zero || c6[i] != zero || c12[i] != zero)
 752                                              && bPairIncluded && withinCutoffMask);
 753                     if (gmx::anyTrue(nonZeroState))
 754                     {
 755                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 756                         {
 757                             RealType divisor = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
 758                             rPInvC           = gmx::inv(divisor);
 759                             sixthRoot(rPInvC, &rInvC, &rC);
 760
 761                             if constexpr (scLambdasOrAlphasDiffer)
 762                             {
 763                                 RealType divisor = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
 764                                 rPInvV           = gmx::inv(divisor);
 765                                 sixthRoot(rPInvV, &rInvV, &rV);
 766                             }
 767                             else
 768                             {
 769                                 /* We can avoid one expensive pow and one / operation */
 770                                 rPInvV = rPInvC;
 771                                 rInvV  = rInvC;
 772                                 rV     = rC;
 773                             }
 774                         }
 775                         else
 776                         {
 777                             rPInvC = one;
 778                             rInvC  = rInv;
 779                             rC     = r;
 780
 781                             rPInvV = one;
 782                             rInvV  = rInv;
 783                             rV     = r;
 784                         }
 785
 786                         /* Only process the coulomb interactions if we either
 787                          * include all entries in the list (no cutoff
 788                          * used in the kernel), or if we are within the cutoff.
 789                          */
 790                         BoolType computeElecInteraction;
 791                         if constexpr (elecInteractionTypeIsEwald)
 792                         {
 793                             computeElecInteraction = (r < rCoulomb && qq[i] != zero && bPairIncluded);
 794                         }
 795                         else
 796                         {
 797                             computeElecInteraction = (rC < rCoulomb && qq[i] != zero && bPairIncluded);
 798                         }
 799                         if (gmx::anyTrue(computeElecInteraction))
 800                         {
 801                             if constexpr (elecInteractionTypeIsEwald)
 802                             {
 803                                 vCoul[i] = ewaldPotential(qq[i], rInvC, sh_ewald);
 804                                 if constexpr (computeForces)
 805                                 {
 806                                     fScalC[i] = ewaldScalarForce(qq[i], rInvC);
 807                                 }
 808
 809                                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 810                                 {
 811                                     ewaldQuadraticPotential<computeForces>(qq[i],
 812                                                                            facel,
 813                                                                            rC,
 814                                                                            rCutoffCoul,
 815                                                                            LFC[i],
 816                                                                            DLF[i],
 817                                                                            gapsysScaleLinpointCoulEff,
 818                                                                            sh_ewald,
 819                                                                            &fScalC[i],
 820                                                                            &vCoul[i],
 821                                                                            &dvdlCoul,
 822                                                                            computeElecInteraction);
 823                                 }
 824                             }
 825                             else
 826                             {
 827                                 vCoul[i] = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
 828                                 if constexpr (computeForces)
 829                                 {
 830                                     fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
 831                                 }
 832
 833                                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 834                                 {
 835                                     reactionFieldQuadraticPotential<computeForces>(
 836                                             qq[i],
 837                                             facel,
 838                                             rC,
 839                                             rCutoffCoul,
 840                                             LFC[i],
 841                                             DLF[i],
 842                                             gapsysScaleLinpointCoulEff,
 843                                             krf,
 844                                             crf,
 845                                             &fScalC[i],
 846                                             &vCoul[i],
 847                                             &dvdlCoul,
 848                                             computeElecInteraction);
 849                                 }
 850                             }
 851
 852                             vCoul[i] = gmx::selectByMask(vCoul[i], computeElecInteraction);
 853                             if constexpr (computeForces)
 854                             {
 855                                 fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
 856                             }
 857                         }
 858
 859                         /* Only process the VDW interactions if we either
 860                          * include all entries in the list (no cutoff used
 861                          * in the kernel), or if we are within the cutoff.
 862                          */
 863                         BoolType computeVdwInteraction;
 864                         if constexpr (vdwInteractionTypeIsEwald)
 865                         {
 866                             computeVdwInteraction =
 867                                     (r < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
 868                         }
 869                         else
 870                         {
 871                             computeVdwInteraction =
 872                                     (rV < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
 873                         }
 874                         if (gmx::anyTrue(computeVdwInteraction))
 875                         {
 876                             RealType rInv6;
 877                             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 878                             {
 879                                 rInv6 = rPInvV;
 880                             }
 881                             else
 882                             {
 883                                 rInv6 = calculateRinv6(rInvV);
 884                             }
 885                             // Avoid overflow at short distance for masked exclusions and
 886                             // for foreign energy calculations at a hard core end state.
 887                             // Note that we should limit r^-6, and thus also r^-12, and
 888                             // not only r^-12, as that could lead to erroneously low instead
 889                             // of very high foreign energies.
 890                             rInv6           = gmx::min(rInv6, maxRInvSix);
 891                             RealType vVdw6  = calculateVdw6(c6[i], rInv6);
 892                             RealType vVdw12 = calculateVdw12(c12[i], rInv6);
 893
 894                             vVdw[i] = lennardJonesPotential(
 895                                     vVdw6, vVdw12, c6[i], c12[i], repulsionShift, dispersionShift, oneSixth, oneTwelfth);
 896                             if constexpr (computeForces)
 897                             {
 898                                 fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
 899                             }
 900
 901                             if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 902                             {
 903                                 lennardJonesQuadraticPotential<computeForces>(c6[i],
 904                                                                               c12[i],
 905                                                                               r,
 906                                                                               rSq,
 907                                                                               LFV[i],
 908                                                                               DLF[i],
 909                                                                               gapsysSigma6VdWEff[i],
 910                                                                               gapsysScaleLinpointVdWEff,
 911                                                                               repulsionShift,
 912                                                                               dispersionShift,
 913                                                                               &fScalV[i],
 914                                                                               &vVdw[i],
 915                                                                               &dvdlVdw,
 916                                                                               computeVdwInteraction);
 917                             }
 918
 919                             if constexpr (vdwInteractionTypeIsEwald)
 920                             {
 921                                 /* Subtract the grid potential at the cut-off */
 922                                 vVdw[i] = vVdw[i]
 923                                           + gmx::selectByMask(ewaldLennardJonesGridSubtract(
 924                                                                       ljPmeC6Grid[i], shLjEwald, oneSixth),
 925                                                               computeVdwInteraction);
 926                             }
 927
 928                             if constexpr (vdwModifierIsPotSwitch)
 929                             {
 930                                 RealType d             = rV - rVdwSwitch;
 931                                 BoolType zeroMask      = zero < d;
 932                                 BoolType potSwitchMask = rV < rVdw;
 933                                 d                      = gmx::selectByMask(d, zeroMask);
 934                                 const RealType d2      = d * d;
 935                                 const RealType sw =
 936                                         one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
 937
 938                                 if constexpr (computeForces)
 939                                 {
 940                                     const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
 941                                     fScalV[i]          = potSwitchScalarForceMod(
 942                                             fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
 943                                 }
 944                                 vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, potSwitchMask);
 945                             }
 946
 947                             vVdw[i] = gmx::selectByMask(vVdw[i], computeVdwInteraction);
 948                             if constexpr (computeForces)
 949                             {
 950                                 fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
 951                             }
 952                         }
 953
 954                         if constexpr (computeForces)
 955                         {
 956                             /* fScalC (and fScalV) now contain: dV/drC * rC
 957                              * Now we multiply by rC^-6, so it will be: dV/drC * rC^-5
 958                              * Further down we first multiply by r^4 and then by
 959                              * the vector r, which in total gives: dV/drC * (r/rC)^-5
 960                              */
 961                             fScalC[i] = fScalC[i] * rPInvC;
 962                             fScalV[i] = fScalV[i] * rPInvV;
 963                         }
 964                     } // end of block requiring nonZeroState
 965                 }     // end for (int i = 0; i < NSTATES; i++)
 966
 967                 /* Assemble A and B states. */
 968                 BoolType assembleStates = (bPairIncluded && withinCutoffMask);
 969                 if (gmx::anyTrue(assembleStates))
 970                 {
 971                     for (int i = 0; i < NSTATES; i++)
 972                     {
 973                         vCTot = vCTot + LFC[i] * vCoul[i];
 974                         vVTot = vVTot + LFV[i] * vVdw[i];
 975
 976                         if constexpr (computeForces)
 977                         {
 978                             fScal = fScal + LFC[i] * fScalC[i] * rpm2;
 979                             fScal = fScal + LFV[i] * fScalV[i] * rpm2;
 980                         }
 981
 982                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 983                         {
 984                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i]
 985                                        + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i];
 986                             dvdlVdw = dvdlVdw + vVdw[i] * DLF[i]
 987                                       + LFV[i] * alphaVdwEff * dlFacVdw[i] * fScalV[i] * sigma6[i];
 988                         }
 989                         else
 990                         {
 991                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i];
 992                             dvdlVdw  = dvdlVdw + vVdw[i] * DLF[i];
 993                         }
 994                     }
 995                 }
 996             } // end of block requiring bPairIncluded && withinCutoffMask
 997             /* In the following block bPairIncluded should be false in the masks. */
 998             if (icoul == NbkernelElecType::ReactionField)
 999             {
1000                 const BoolType computeReactionField = bPairExcluded;
1001
1002                 if (gmx::anyTrue(computeReactionField))
1003                 {
1004                     /* For excluded pairs we don't use soft-core.
1005                      * As there is no singularity, there is no need for soft-core.
1006                      */
1007                     const RealType FF = -two * krf;
1008                     RealType       VV = krf * rSq - crf;
1009
1010                     /* If ii == jnr the i particle (ii) has itself (jnr)
1011                      * in its neighborlist. This corresponds to a self-interaction
1012                      * that will occur twice. Scale it down by 50% to only include
1013                      * it once.
1014                      */
1015                     VV = VV * gmx::blend(one, half, bIiEqJnr);
1016
1017                     for (int i = 0; i < NSTATES; i++)
1018                     {
1019                         vCTot = vCTot + gmx::selectByMask(LFC[i] * qq[i] * VV, computeReactionField);
1020                         fScal = fScal + gmx::selectByMask(LFC[i] * qq[i] * FF, computeReactionField);
1021                         dvdlCoul = dvdlCoul + gmx::selectByMask(DLF[i] * qq[i] * VV, computeReactionField);
1022                     }
1023                 }
1024             }
1025
1026             const BoolType computeElecEwaldInteraction = (bPairExcluded || r < rCoulomb);
1027             if (elecInteractionTypeIsEwald && gmx::anyTrue(computeElecEwaldInteraction))
1028             {
1029                 /* See comment in the preamble. When using Ewald interactions
1030                  * (unless we use a switch modifier) we subtract the reciprocal-space
1031                  * Ewald component here which made it possible to apply the free
1032                  * energy interaction to 1/r (vanilla coulomb short-range part)
1033                  * above. This gets us closer to the ideal case of applying
1034                  * the softcore to the entire electrostatic interaction,
1035                  * including the reciprocal-space component.
1036                  */
1037                 RealType v_lr, f_lr;
1038
1039                 pmeCoulombCorrectionVF<computeForces>(rSq, ewaldBeta, &v_lr, &f_lr);
1040                 if constexpr (computeForces)
1041                 {
1042                     f_lr = f_lr * rInv * rInv;
1043                 }
1044
1045                 /* Note that any possible Ewald shift has already been applied in
1046                  * the normal interaction part above.
1047                  */
1048
1049                 /* If ii == jnr the i particle (ii) has itself (jnr)
1050                  * in its neighborlist. This corresponds to a self-interaction
1051                  * that will occur twice. Scale it down by 50% to only include
1052                  * it once.
1053                  */
1054                 v_lr = v_lr * gmx::blend(one, half, bIiEqJnr);
1055
1056                 for (int i = 0; i < NSTATES; i++)
1057                 {
1058                     vCTot = vCTot - gmx::selectByMask(LFC[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1059                     if constexpr (computeForces)
1060                     {
1061                         fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
1062                     }
1063                     dvdlCoul = dvdlCoul
1064                                - gmx::selectByMask(DLF[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1065                 }
1066             }
1067
1068             const BoolType computeVdwEwaldInteraction = (bPairExcluded || r < rVdw);
1069             if (vdwInteractionTypeIsEwald && gmx::anyTrue(computeVdwEwaldInteraction))
1070             {
1071                 /* See comment in the preamble. When using LJ-Ewald interactions
1072                  * (unless we use a switch modifier) we subtract the reciprocal-space
1073                  * Ewald component here which made it possible to apply the free
1074                  * energy interaction to r^-6 (vanilla LJ6 short-range part)
1075                  * above. This gets us closer to the ideal case of applying
1076                  * the softcore to the entire VdW interaction,
1077                  * including the reciprocal-space component.
1078                  */
1079
1080                 RealType v_lr, f_lr;
1081                 pmeLJCorrectionVF<computeForces>(
1082                         rInv, rSq, ewaldLJCoeffSq, ewaldLJCoeffSixDivSix, &v_lr, &f_lr, computeVdwEwaldInteraction, bIiEqJnr);
1083                 v_lr = v_lr * oneSixth;
1084
1085                 for (int i = 0; i < NSTATES; i++)
1086                 {
1087                     vVTot = vVTot + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1088                     if constexpr (computeForces)
1089                     {
1090                         fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
1091                     }
1092                     dvdlVdw = dvdlVdw + gmx::selectByMask(DLF[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1093                 }
1094             }
1095
1096             if (computeForces && gmx::anyTrue(fScal != zero))
1097             {
1098                 const RealType tX = fScal * dX;
1099                 const RealType tY = fScal * dY;
1100                 const RealType tZ = fScal * dZ;
1101                 fIX               = fIX + tX;
1102                 fIY               = fIY + tY;
1103                 fIZ               = fIZ + tZ;
1104
1105                 gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ);
1106             }
1107         } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
1108
1109         if (havePairsWithinCutoff)
1110         {
1111             if constexpr (computeForces)
1112             {
1113                 gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ);
1114
1115                 if (doShiftForces)
1116                 {
1117                     gmx::transposeScatterIncrU<3>(
1118                             reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
1119                 }
1120             }
1121             if (doPotential)
1122             {
1123                 int ggid = gid[n];
1124                 threadVc[ggid] += gmx::reduce(vCTot);
1125                 threadVv[ggid] += gmx::reduce(vVTot);
1126             }
1127         }
1128     } // end for (int n = 0; n < nri; n++)
1129
1130     if (gmx::anyTrue(dvdlCoul != zero))
1131     {
1132         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)] += gmx::reduce(dvdlCoul);
1133     }
1134     if (gmx::anyTrue(dvdlVdw != zero))
1135     {
1136         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)] += gmx::reduce(dvdlVdw);
1137     }
1138
1139     /* Estimate flops, average for free energy stuff:
1140      * 12  flops per outer iteration
1141      * 150 flops per inner iteration
1142      * TODO: Update the number of flops and/or use different counts for different code paths.
1143      */
1144     atomicNrnbIncrement(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist.nri * 12 + nlist.jindex[nri] * 150);
1145
1146     if (haveExcludedPairsBeyondRlist > 0)
1147     {
1148         gmx_fatal(FARGS,
1149                   "There are perturbed non-bonded pair interactions beyond the pair-list cutoff "
1150                   "of %g nm, which is not supported. This can happen because the system is "
1151                   "unstable or because intra-molecular interactions at long distances are "
1152                   "excluded. If the "
1153                   "latter is the case, you can try to increase nstlist or rlist to avoid this."
1154                   "The error is likely triggered by the use of couple-intramol=no "
1155                   "and the maximal distance in the decoupled molecule exceeding rlist.",
1156                   rlist);
1157     }
1158 }
1159
1160 typedef void (*KernelFunction)(const t_nblist&                                  nlist,
1161                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1162                                const int                                        ntype,
1163                                const real                                       rlist,
1164                                const interaction_const_t&                       ic,
1165                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1166                                gmx::ArrayRef<const real>                        nbfp,
1167                                gmx::ArrayRef<const real>                        nbfp_grid,
1168                                gmx::ArrayRef<const real>                        chargeA,
1169                                gmx::ArrayRef<const real>                        chargeB,
1170                                gmx::ArrayRef<const int>                         typeA,
1171                                gmx::ArrayRef<const int>                         typeB,
1172                                int                                              flags,
1173                                gmx::ArrayRef<const real>                        lambda,
1174                                t_nrnb* gmx_restrict                             nrnb,
1175                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1176                                rvec*               threadForceShiftBuffer,
1177                                gmx::ArrayRef<real> threadVc,
1178                                gmx::ArrayRef<real> threadVv,
1179                                gmx::ArrayRef<real> threadDvdl);
1180
1181 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
1182 static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
1183 {
1184     if (useSimd)
1185     {
1186 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS
1187         return (nb_free_energy_kernel<SimdDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1188 #else
1189         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1190 #endif
1191     }
1192     else
1193     {
1194         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1195     }
1196 }
1197
1198 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
1199 static KernelFunction dispatchKernelOnComputeForces(const bool computeForces, const bool useSimd)
1200 {
1201     if (computeForces)
1202     {
1203         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, true>(
1204                 useSimd));
1205     }
1206     else
1207     {
1208         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, false>(
1209                 useSimd));
1210     }
1211 }
1212
1213 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
1214 static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch,
1215                                                   const bool computeForces,
1216                                                   const bool useSimd)
1217 {
1218     if (vdwModifierIsPotSwitch)
1219     {
1220         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
1221                 computeForces, useSimd));
1222     }
1223     else
1224     {
1225         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
1226                 computeForces, useSimd));
1227     }
1228 }
1229
1230 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
1231 static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
1232                                                           const bool vdwModifierIsPotSwitch,
1233                                                           const bool computeForces,
1234                                                           const bool useSimd)
1235 {
1236     if (elecInteractionTypeIsEwald)
1237     {
1238         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
1239                 vdwModifierIsPotSwitch, computeForces, useSimd));
1240     }
1241     else
1242     {
1243         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
1244                 vdwModifierIsPotSwitch, computeForces, useSimd));
1245     }
1246 }
1247
1248 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer>
1249 static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
1250                                                          const bool elecInteractionTypeIsEwald,
1251                                                          const bool vdwModifierIsPotSwitch,
1252                                                          const bool computeForces,
1253                                                          const bool useSimd)
1254 {
1255     if (vdwInteractionTypeIsEwald)
1256     {
1257         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, true>(
1258                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1259     }
1260     else
1261     {
1262         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, false>(
1263                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1264     }
1265 }
1266
1267 template<KernelSoftcoreType softcoreType>
1268 static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
1269                                                                   const bool vdwInteractionTypeIsEwald,
1270                                                                   const bool elecInteractionTypeIsEwald,
1271                                                                   const bool vdwModifierIsPotSwitch,
1272                                                                   const bool computeForces,
1273                                                                   const bool useSimd)
1274 {
1275     if (scLambdasOrAlphasDiffer)
1276     {
1277         return (dispatchKernelOnVdwInteractionType<softcoreType, true>(
1278                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1279     }
1280     else
1281     {
1282         return (dispatchKernelOnVdwInteractionType<softcoreType, false>(
1283                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1284     }
1285 }
1286
1287 static KernelFunction dispatchKernel(const bool                 scLambdasOrAlphasDiffer,
1288                                      const bool                 vdwInteractionTypeIsEwald,
1289                                      const bool                 elecInteractionTypeIsEwald,
1290                                      const bool                 vdwModifierIsPotSwitch,
1291                                      const bool                 computeForces,
1292                                      const bool                 useSimd,
1293                                      const interaction_const_t& ic)
1294 {
1295     const auto& scParams = *ic.softCoreParameters;
1296     if (scParams.softcoreType == SoftcoreType::Beutler)
1297     {
1298         if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
1299         {
1300             return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
1301                     scLambdasOrAlphasDiffer,
1302                     vdwInteractionTypeIsEwald,
1303                     elecInteractionTypeIsEwald,
1304                     vdwModifierIsPotSwitch,
1305                     computeForces,
1306                     useSimd));
1307         }
1308         return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Beutler>(
1309                 scLambdasOrAlphasDiffer,
1310                 vdwInteractionTypeIsEwald,
1311                 elecInteractionTypeIsEwald,
1312                 vdwModifierIsPotSwitch,
1313                 computeForces,
1314                 useSimd));
1315     }
1316     else // Gapsys
1317     {
1318         if (scParams.gapsysScaleLinpointCoul == 0 && scParams.gapsysScaleLinpointVdW == 0)
1319         {
1320             return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
1321                     scLambdasOrAlphasDiffer,
1322                     vdwInteractionTypeIsEwald,
1323                     elecInteractionTypeIsEwald,
1324                     vdwModifierIsPotSwitch,
1325                     computeForces,
1326                     useSimd));
1327         }
1328         return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Gapsys>(
1329                 scLambdasOrAlphasDiffer,
1330                 vdwInteractionTypeIsEwald,
1331                 elecInteractionTypeIsEwald,
1332                 vdwModifierIsPotSwitch,
1333                 computeForces,
1334                 useSimd));
1335     }
1336 }
1337
1338
1339 void gmx_nb_free_energy_kernel(const t_nblist&                                  nlist,
1340                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1341                                const bool                                       useSimd,
1342                                const int                                        ntype,
1343                                const real                                       rlist,
1344                                const interaction_const_t&                       ic,
1345                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1346                                gmx::ArrayRef<const real>                        nbfp,
1347                                gmx::ArrayRef<const real>                        nbfp_grid,
1348                                gmx::ArrayRef<const real>                        chargeA,
1349                                gmx::ArrayRef<const real>                        chargeB,
1350                                gmx::ArrayRef<const int>                         typeA,
1351                                gmx::ArrayRef<const int>                         typeB,
1352                                int                                              flags,
1353                                gmx::ArrayRef<const real>                        lambda,
1354                                t_nrnb*                                          nrnb,
1355                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1356                                rvec*               threadForceShiftBuffer,
1357                                gmx::ArrayRef<real> threadVc,
1358                                gmx::ArrayRef<real> threadVv,
1359                                gmx::ArrayRef<real> threadDvdl)
1360 {
1361     GMX_ASSERT(EEL_PME_EWALD(ic.eeltype) || ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype),
1362                "Unsupported eeltype with free energy");
1363     GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters");
1364
1365     // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert
1366     GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty()
1367                        || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(),
1368                "We need actual padding with at least one element for SIMD scatter operations");
1369
1370     const auto& scParams                   = *ic.softCoreParameters;
1371     const bool  vdwInteractionTypeIsEwald  = (EVDW_PME(ic.vdwtype));
1372     const bool  elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype));
1373     const bool  vdwModifierIsPotSwitch     = (ic.vdw_modifier == InteractionModifiers::PotSwitch);
1374     const bool  computeForces              = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
1375     bool        scLambdasOrAlphasDiffer    = true;
1376
1377     if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
1378     {
1379         scLambdasOrAlphasDiffer = false;
1380     }
1381     else
1382     {
1383         if (lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
1384                     == lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]
1385             && scParams.alphaCoulomb == scParams.alphaVdw)
1386         {
1387             scLambdasOrAlphasDiffer = false;
1388         }
1389     }
1390
1391     KernelFunction kernelFunc;
1392     kernelFunc = dispatchKernel(scLambdasOrAlphasDiffer,
1393                                 vdwInteractionTypeIsEwald,
1394                                 elecInteractionTypeIsEwald,
1395                                 vdwModifierIsPotSwitch,
1396                                 computeForces,
1397                                 useSimd,
1398                                 ic);
1399     kernelFunc(nlist,
1400                coords,
1401                ntype,
1402                rlist,
1403                ic,
1404                shiftvec,
1405                nbfp,
1406                nbfp_grid,
1407                chargeA,
1408                chargeB,
1409                typeA,
1410                typeB,
1411                flags,
1412                lambda,
1413                nrnb,
1414                threadForceBuffer,
1415                threadForceShiftBuffer,
1416                threadVc,
1417                threadVv,
1418                threadDvdl);
1419 }