src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "nb_free_energy.h"
  41
  42 #include "config.h"
  43
  44 #include <cmath>
  45 #include <set>
  46
  47 #include <algorithm>
  48
  49 #include "gromacs/gmxlib/nrnb.h"
  50 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
  51 #include "gromacs/math/arrayrefwithpadding.h"
  52 #include "gromacs/math/functions.h"
  53 #include "gromacs/math/vec.h"
  54 #include "gromacs/mdtypes/forceoutput.h"
  55 #include "gromacs/mdtypes/forcerec.h"
  56 #include "gromacs/mdtypes/interaction_const.h"
  57 #include "gromacs/mdtypes/md_enums.h"
  58 #include "gromacs/mdtypes/mdatom.h"
  59 #include "gromacs/mdtypes/nblist.h"
  60 #include "gromacs/pbcutil/ishift.h"
  61 #include "gromacs/simd/simd.h"
  62 #include "gromacs/simd/simd_math.h"
  63 #include "gromacs/utility/fatalerror.h"
  64 #include "gromacs/utility/arrayref.h"
  65
  66 #include "nb_softcore.h"
  67
  68 //! Scalar (non-SIMD) data types.
  69 struct ScalarDataTypes
  70 {
  71     using RealType = real; //!< The data type to use as real.
  72     using IntType  = int;  //!< The data type to use as int.
  73     using BoolType = bool; //!< The data type to use as bool for real value comparison.
  74     static constexpr int simdRealWidth = 1; //!< The width of the RealType.
  75     static constexpr int simdIntWidth  = 1; //!< The width of the IntType.
  76 };
  77
  78 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS
  79 //! SIMD data types.
  80 struct SimdDataTypes
  81 {
  82     using RealType = gmx::SimdReal;  //!< The data type to use as real.
  83     using IntType  = gmx::SimdInt32; //!< The data type to use as int.
  84     using BoolType = gmx::SimdBool;  //!< The data type to use as bool for real value comparison.
  85     static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH; //!< The width of the RealType.
  86 #    if GMX_SIMD_HAVE_DOUBLE && GMX_DOUBLE
  87     static constexpr int simdIntWidth = GMX_SIMD_DINT32_WIDTH; //!< The width of the IntType.
  88 #    else
  89     static constexpr int simdIntWidth = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType.
  90 #    endif
  91 };
  92 #endif
  93
  94 /*! \brief Lower limit for square interaction distances in nonbonded kernels.
  95  *
  96  * This is a mimimum on r^2 to avoid overflows when computing r^6.
  97  * This will only affect results for soft-cored interaction at distances smaller
  98  * than 1e-6 and will limit extremely high foreign energies for overlapping atoms.
  99  * Note that we could use a somewhat smaller minimum in double precision.
 100  * But because invsqrt in double precision can use single precision, this number
 101  * can not be much smaller, we use the same number for simplicity.
 102  */
 103 constexpr real c_minDistanceSquared = 1.0e-12_real;
 104
 105 /*! \brief Higher limit for r^-6 used for Lennard-Jones interactions
 106  *
 107  * This is needed to avoid overflow of LJ energy and force terms for excluded
 108  * atoms and foreign energies of hard-core states of overlapping atoms.
 109  * Note that in single precision this value leaves room for C12 coefficients up to 3.4e8.
 110  */
 111 constexpr real c_maxRInvSix = 1.0e15_real;
 112
 113 template<bool computeForces, class RealType>
 114 static inline void
 115 pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType gmx_unused* force)
 116 {
 117     const RealType brsq = rSq * beta * beta;
 118     if constexpr (computeForces)
 119     {
 120         *force = -brsq * beta * gmx::pmeForceCorrection(brsq);
 121     }
 122     *pot = beta * gmx::pmePotentialCorrection(brsq);
 123 }
 124
 125 template<bool computeForces, class RealType, class BoolType>
 126 static inline void pmeLJCorrectionVF(const RealType rInv,
 127                                      const RealType rSq,
 128                                      const real     ewaldLJCoeffSq,
 129                                      const real     ewaldLJCoeffSixDivSix,
 130                                      RealType*      pot,
 131                                      RealType gmx_unused* force,
 132                                      const BoolType       mask,
 133                                      const BoolType       bIiEqJnr)
 134 {
 135     // We mask rInv to get zero force and potential for masked out pair interactions
 136     const RealType rInvSq  = rInv * rInv;
 137     const RealType rInvSix = rInvSq * rInvSq * rInvSq;
 138     // Mask rSq to avoid underflow in exp()
 139     const RealType coeffSqRSq       = ewaldLJCoeffSq * gmx::selectByMask(rSq, mask);
 140     const RealType expNegCoeffSqRSq = gmx::exp(-coeffSqRSq);
 141     const RealType poly             = 1.0_real + coeffSqRSq + 0.5_real * coeffSqRSq * coeffSqRSq;
 142     if constexpr (computeForces)
 143     {
 144         *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
 145         *force = *force * rInvSq;
 146     }
 147     // The self interaction is the limit for r -> 0 which we need to compute separately
 148     *pot = gmx::blend(
 149             rInvSix * (1.0_real - expNegCoeffSqRSq * poly), 0.5_real * ewaldLJCoeffSixDivSix, bIiEqJnr);
 150 }
 151
 152 //! Computes r^(1/6) and 1/r^(1/6)
 153 template<class RealType>
 154 static inline void sixthRoot(const RealType r, RealType* sixthRoot, RealType* invSixthRoot)
 155 {
 156     RealType cbrtRes = gmx::cbrt(r);
 157     *invSixthRoot    = gmx::invsqrt(cbrtRes);
 158     *sixthRoot       = gmx::inv(*invSixthRoot);
 159 }
 160
 161 template<class RealType>
 162 static inline RealType calculateRinv6(const RealType rInvV)
 163 {
 164     RealType rInv6 = rInvV * rInvV;
 165     return (rInv6 * rInv6 * rInv6);
 166 }
 167
 168 template<class RealType>
 169 static inline RealType calculateVdw6(const RealType c6, const RealType rInv6)
 170 {
 171     return (c6 * rInv6);
 172 }
 173
 174 template<class RealType>
 175 static inline RealType calculateVdw12(const RealType c12, const RealType rInv6)
 176 {
 177     return (c12 * rInv6 * rInv6);
 178 }
 179
 180 /* reaction-field electrostatics */
 181 template<class RealType>
 182 static inline RealType reactionFieldScalarForce(const RealType qq,
 183                                                 const RealType rInv,
 184                                                 const RealType r,
 185                                                 const real     krf,
 186                                                 const real     two)
 187 {
 188     return (qq * (rInv - two * krf * r * r));
 189 }
 190 template<class RealType>
 191 static inline RealType reactionFieldPotential(const RealType qq,
 192                                               const RealType rInv,
 193                                               const RealType r,
 194                                               const real     krf,
 195                                               const real     potentialShift)
 196 {
 197     return (qq * (rInv + krf * r * r - potentialShift));
 198 }
 199
 200 /* Ewald electrostatics */
 201 template<class RealType>
 202 static inline RealType ewaldScalarForce(const RealType coulomb, const RealType rInv)
 203 {
 204     return (coulomb * rInv);
 205 }
 206 template<class RealType>
 207 static inline RealType ewaldPotential(const RealType coulomb, const RealType rInv, const real potentialShift)
 208 {
 209     return (coulomb * (rInv - potentialShift));
 210 }
 211
 212 /* cutoff LJ */
 213 template<class RealType>
 214 static inline RealType lennardJonesScalarForce(const RealType v6, const RealType v12)
 215 {
 216     return (v12 - v6);
 217 }
 218 template<class RealType>
 219 static inline RealType lennardJonesPotential(const RealType v6,
 220                                              const RealType v12,
 221                                              const RealType c6,
 222                                              const RealType c12,
 223                                              const real     repulsionShift,
 224                                              const real     dispersionShift,
 225                                              const real     oneSixth,
 226                                              const real     oneTwelfth)
 227 {
 228     return ((v12 + c12 * repulsionShift) * oneTwelfth - (v6 + c6 * dispersionShift) * oneSixth);
 229 }
 230
 231 /* Ewald LJ */
 232 template<class RealType>
 233 static inline RealType ewaldLennardJonesGridSubtract(const RealType c6grid,
 234                                                      const real     potentialShift,
 235                                                      const real     oneSixth)
 236 {
 237     return (c6grid * potentialShift * oneSixth);
 238 }
 239
 240 /* LJ Potential switch */
 241 template<class RealType, class BoolType>
 242 static inline RealType potSwitchScalarForceMod(const RealType fScalarInp,
 243                                                const RealType potential,
 244                                                const RealType sw,
 245                                                const RealType r,
 246                                                const RealType dsw,
 247                                                const BoolType mask)
 248 {
 249     /* The mask should select on rV < rVdw */
 250     return (gmx::selectByMask(fScalarInp * sw - r * potential * dsw, mask));
 251 }
 252 template<class RealType, class BoolType>
 253 static inline RealType potSwitchPotentialMod(const RealType potentialInp, const RealType sw, const BoolType mask)
 254 {
 255     /* The mask should select on rV < rVdw */
 256     return (gmx::selectByMask(potentialInp * sw, mask));
 257 }
 258
 259
 260 //! Templated free-energy non-bonded kernel
 261 template<typename DataTypes, KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
 262 static void nb_free_energy_kernel(const t_nblist&                                  nlist,
 263                                   const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
 264                                   const int                                        ntype,
 265                                   const real                                       rlist,
 266                                   const interaction_const_t&                       ic,
 267                                   gmx::ArrayRef<const gmx::RVec>                   shiftvec,
 268                                   gmx::ArrayRef<const real>                        nbfp,
 269                                   gmx::ArrayRef<const real> gmx_unused             nbfp_grid,
 270                                   gmx::ArrayRef<const real>                        chargeA,
 271                                   gmx::ArrayRef<const real>                        chargeB,
 272                                   gmx::ArrayRef<const int>                         typeA,
 273                                   gmx::ArrayRef<const int>                         typeB,
 274                                   int                                              flags,
 275                                   gmx::ArrayRef<const real>                        lambda,
 276                                   t_nrnb* gmx_restrict                             nrnb,
 277                                   gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
 278                                   rvec gmx_unused*    threadForceShiftBuffer,
 279                                   gmx::ArrayRef<real> threadVc,
 280                                   gmx::ArrayRef<real> threadVv,
 281                                   gmx::ArrayRef<real> threadDvdl)
 282 {
 283 #define STATE_A 0
 284 #define STATE_B 1
 285 #define NSTATES 2
 286
 287     using RealType = typename DataTypes::RealType;
 288     using IntType  = typename DataTypes::IntType;
 289     using BoolType = typename DataTypes::BoolType;
 290
 291     constexpr real oneTwelfth = 1.0_real / 12.0_real;
 292     constexpr real oneSixth   = 1.0_real / 6.0_real;
 293     constexpr real zero       = 0.0_real;
 294     constexpr real half       = 0.5_real;
 295     constexpr real one        = 1.0_real;
 296     constexpr real two        = 2.0_real;
 297     constexpr real six        = 6.0_real;
 298
 299     // Extract pair list data
 300     const int                nri    = nlist.nri;
 301     gmx::ArrayRef<const int> iinr   = nlist.iinr;
 302     gmx::ArrayRef<const int> jindex = nlist.jindex;
 303     gmx::ArrayRef<const int> jjnr   = nlist.jjnr;
 304     gmx::ArrayRef<const int> shift  = nlist.shift;
 305     gmx::ArrayRef<const int> gid    = nlist.gid;
 306
 307     const real lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
 308     const real lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
 309
 310     // Extract softcore parameters
 311     const auto&           scParams   = *ic.softCoreParameters;
 312     const real            lam_power  = scParams.lambdaPower;
 313     const real gmx_unused alpha_coul = scParams.alphaCoulomb;
 314     const real gmx_unused alpha_vdw  = scParams.alphaVdw;
 315     const real gmx_unused sigma6_def = scParams.sigma6WithInvalidSigma;
 316     const real gmx_unused sigma6_min = scParams.sigma6Minimum;
 317
 318     const real gmx_unused scaleLinpointCoulGapsys = scParams.scaleLinpointCoulGapsys;
 319     const real gmx_unused scaleLinpointVdWGapsys  = scParams.scaleLinpointVdWGapsys;
 320     const real gmx_unused sigma6VdWGapsys         = scParams.sigma6VdWGapsys;
 321
 322     const bool gmx_unused doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
 323     const bool            doPotential   = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0);
 324
 325     // Extract data from interaction_const_t
 326     const real            facel           = ic.epsfac;
 327     const real            rCoulomb        = ic.rcoulomb;
 328     const real            krf             = ic.reactionFieldCoefficient;
 329     const real            crf             = ic.reactionFieldShift;
 330     const real gmx_unused shLjEwald       = ic.sh_lj_ewald;
 331     const real            rVdw            = ic.rvdw;
 332     const real            dispersionShift = ic.dispersion_shift.cpot;
 333     const real            repulsionShift  = ic.repulsion_shift.cpot;
 334     const real            ewaldBeta       = ic.ewaldcoeff_q;
 335     real gmx_unused       ewaldLJCoeffSq;
 336     real gmx_unused       ewaldLJCoeffSixDivSix;
 337     if constexpr (vdwInteractionTypeIsEwald)
 338     {
 339         ewaldLJCoeffSq        = ic.ewaldcoeff_lj * ic.ewaldcoeff_lj;
 340         ewaldLJCoeffSixDivSix = ewaldLJCoeffSq * ewaldLJCoeffSq * ewaldLJCoeffSq / six;
 341     }
 342
 343     // Note that the nbnxm kernels do not support Coulomb potential switching at all
 344     GMX_ASSERT(ic.coulomb_modifier != InteractionModifiers::PotSwitch,
 345                "Potential switching is not supported for Coulomb with FEP");
 346
 347     const real      rVdwSwitch = ic.rvdw_switch;
 348     real gmx_unused vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
 349     if constexpr (vdwModifierIsPotSwitch)
 350     {
 351         const real d = rVdw - rVdwSwitch;
 352         vdw_swV3     = -10.0_real / (d * d * d);
 353         vdw_swV4     = 15.0_real / (d * d * d * d);
 354         vdw_swV5     = -6.0_real / (d * d * d * d * d);
 355         vdw_swF2     = -30.0_real / (d * d * d);
 356         vdw_swF3     = 60.0_real / (d * d * d * d);
 357         vdw_swF4     = -30.0_real / (d * d * d * d * d);
 358     }
 359     else
 360     {
 361         /* Avoid warnings from stupid compilers (looking at you, Clang!) */
 362         vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = zero;
 363     }
 364
 365     NbkernelElecType icoul;
 366     if (ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype))
 367     {
 368         icoul = NbkernelElecType::ReactionField;
 369     }
 370     else
 371     {
 372         icoul = NbkernelElecType::None;
 373     }
 374
 375     real rcutoff_max2                 = std::max(ic.rcoulomb, ic.rvdw);
 376     rcutoff_max2                      = rcutoff_max2 * rcutoff_max2;
 377     const real gmx_unused rCutoffCoul = ic.rcoulomb;
 378
 379     real gmx_unused sh_ewald = 0;
 380     if constexpr (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
 381     {
 382         sh_ewald = ic.sh_ewald;
 383     }
 384
 385     /* For Ewald/PME interactions we cannot easily apply the soft-core component to
 386      * reciprocal space. When we use non-switched Ewald interactions, we
 387      * assume the soft-coring does not significantly affect the grid contribution
 388      * and apply the soft-core only to the full 1/r (- shift) pair contribution.
 389      *
 390      * However, we cannot use this approach for switch-modified since we would then
 391      * effectively end up evaluating a significantly different interaction here compared to the
 392      * normal (non-free-energy) kernels, either by applying a cutoff at a different
 393      * position than what the user requested, or by switching different
 394      * things (1/r rather than short-range Ewald). For these settings, we just
 395      * use the traditional short-range Ewald interaction in that case.
 396      */
 397     GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
 398                        "Can not apply soft-core to switched Ewald potentials");
 399
 400     const RealType            minDistanceSquared(c_minDistanceSquared);
 401     const RealType            maxRInvSix(c_maxRInvSix);
 402     const RealType gmx_unused floatMin(GMX_FLOAT_MIN);
 403
 404     RealType dvdlCoul(zero);
 405     RealType dvdlVdw(zero);
 406
 407     /* Lambda factor for state A, 1-lambda*/
 408     real LFC[NSTATES], LFV[NSTATES];
 409     LFC[STATE_A] = one - lambda_coul;
 410     LFV[STATE_A] = one - lambda_vdw;
 411
 412     /* Lambda factor for state B, lambda*/
 413     LFC[STATE_B] = lambda_coul;
 414     LFV[STATE_B] = lambda_vdw;
 415
 416     /*derivative of the lambda factor for state A and B */
 417     real DLF[NSTATES];
 418     DLF[STATE_A] = -one;
 419     DLF[STATE_B] = one;
 420
 421     real gmx_unused lFacCoul[NSTATES], dlFacCoul[NSTATES], lFacVdw[NSTATES], dlFacVdw[NSTATES];
 422     constexpr real  sc_r_power = six;
 423     for (int i = 0; i < NSTATES; i++)
 424     {
 425         lFacCoul[i]  = (lam_power == 2 ? (1 - LFC[i]) * (1 - LFC[i]) : (1 - LFC[i]));
 426         dlFacCoul[i] = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFC[i]) : 1);
 427         lFacVdw[i]   = (lam_power == 2 ? (1 - LFV[i]) * (1 - LFV[i]) : (1 - LFV[i]));
 428         dlFacVdw[i]  = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
 429     }
 430
 431     // We need pointers to real for SIMD access
 432     const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
 433     real* gmx_restrict       forceRealPtr;
 434     if constexpr (computeForces)
 435     {
 436         GMX_ASSERT(nri == 0 || !threadForceBuffer.empty(), "need a valid threadForceBuffer");
 437
 438         forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0];
 439
 440         if (doShiftForces)
 441         {
 442             GMX_ASSERT(threadForceShiftBuffer != nullptr, "need a valid threadForceShiftBuffer");
 443         }
 444     }
 445
 446     const real rlistSquared = gmx::square(rlist);
 447
 448     bool haveExcludedPairsBeyondRlist = false;
 449
 450     for (int n = 0; n < nri; n++)
 451     {
 452         bool havePairsWithinCutoff = false;
 453
 454         const int  is   = shift[n];
 455         const real shX  = shiftvec[is][XX];
 456         const real shY  = shiftvec[is][YY];
 457         const real shZ  = shiftvec[is][ZZ];
 458         const int  nj0  = jindex[n];
 459         const int  nj1  = jindex[n + 1];
 460         const int  ii   = iinr[n];
 461         const int  ii3  = 3 * ii;
 462         const real ix   = shX + x[ii3 + 0];
 463         const real iy   = shY + x[ii3 + 1];
 464         const real iz   = shZ + x[ii3 + 2];
 465         const real iqA  = facel * chargeA[ii];
 466         const real iqB  = facel * chargeB[ii];
 467         const int  ntiA = ntype * typeA[ii];
 468         const int  ntiB = ntype * typeB[ii];
 469         RealType   vCTot(0);
 470         RealType   vVTot(0);
 471         RealType   fIX(0);
 472         RealType   fIY(0);
 473         RealType   fIZ(0);
 474
 475 #if GMX_SIMD_HAVE_REAL
 476         alignas(GMX_SIMD_ALIGNMENT) int            preloadIi[DataTypes::simdRealWidth];
 477         alignas(GMX_SIMD_ALIGNMENT) int gmx_unused preloadIs[DataTypes::simdRealWidth];
 478 #else
 479         int            preloadIi[DataTypes::simdRealWidth];
 480         int gmx_unused preloadIs[DataTypes::simdRealWidth];
 481 #endif
 482         for (int s = 0; s < DataTypes::simdRealWidth; s++)
 483         {
 484             preloadIi[s] = ii;
 485             preloadIs[s] = shift[n];
 486         }
 487         IntType ii_s = gmx::load<IntType>(preloadIi);
 488
 489         for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
 490         {
 491             RealType r, rInv;
 492
 493 #if GMX_SIMD_HAVE_REAL
 494             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIsValid[DataTypes::simdRealWidth];
 495             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIncluded[DataTypes::simdRealWidth];
 496             alignas(GMX_SIMD_ALIGNMENT) int32_t preloadJnr[DataTypes::simdRealWidth];
 497             alignas(GMX_SIMD_ALIGNMENT) int32_t typeIndices[NSTATES][DataTypes::simdRealWidth];
 498             alignas(GMX_SIMD_ALIGNMENT) real    preloadQq[NSTATES][DataTypes::simdRealWidth];
 499             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
 500             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
 501             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
 502             alignas(GMX_SIMD_ALIGNMENT)
 503                     real gmx_unused preloadScaleLinpointVdWGapsys[DataTypes::simdRealWidth];
 504             alignas(GMX_SIMD_ALIGNMENT)
 505                     real gmx_unused preloadScaleLinpointCoulGapsys[DataTypes::simdRealWidth];
 506             alignas(GMX_SIMD_ALIGNMENT)
 507                     real gmx_unused preloadSigma6VdWGapsys[NSTATES][DataTypes::simdRealWidth];
 508             alignas(GMX_SIMD_ALIGNMENT) real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
 509 #else
 510             real            preloadPairIsValid[DataTypes::simdRealWidth];
 511             real            preloadPairIncluded[DataTypes::simdRealWidth];
 512             int             preloadJnr[DataTypes::simdRealWidth];
 513             int             typeIndices[NSTATES][DataTypes::simdRealWidth];
 514             real            preloadQq[NSTATES][DataTypes::simdRealWidth];
 515             real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
 516             real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
 517             real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
 518             real gmx_unused preloadScaleLinpointVdWGapsys[DataTypes::simdRealWidth];
 519             real gmx_unused preloadScaleLinpointCoulGapsys[DataTypes::simdRealWidth];
 520             real gmx_unused preloadSigma6VdWGapsys[NSTATES][DataTypes::simdRealWidth];
 521             real            preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
 522 #endif
 523             for (int s = 0; s < DataTypes::simdRealWidth; s++)
 524             {
 525                 if (k + s < nj1)
 526                 {
 527                     preloadPairIsValid[s] = true;
 528                     /* Check if this pair on the exclusions list.*/
 529                     preloadPairIncluded[s]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + s]);
 530                     const int jnr           = jjnr[k + s];
 531                     preloadJnr[s]           = jnr;
 532                     typeIndices[STATE_A][s] = ntiA + typeA[jnr];
 533                     typeIndices[STATE_B][s] = ntiB + typeB[jnr];
 534                     preloadQq[STATE_A][s]   = iqA * chargeA[jnr];
 535                     preloadQq[STATE_B][s]   = iqB * chargeB[jnr];
 536
 537                     for (int i = 0; i < NSTATES; i++)
 538                     {
 539                         if constexpr (vdwInteractionTypeIsEwald)
 540                         {
 541                             preloadLjPmeC6Grid[i][s] = nbfp_grid[2 * typeIndices[i][s]];
 542                         }
 543                         else
 544                         {
 545                             preloadLjPmeC6Grid[i][s] = 0;
 546                         }
 547                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 548                         {
 549                             const real c6  = nbfp[2 * typeIndices[i][s]];
 550                             const real c12 = nbfp[2 * typeIndices[i][s] + 1];
 551                             if (c6 > 0 && c12 > 0)
 552                             {
 553                                 /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
 554                                 preloadSigma6[i][s] = 0.5_real * c12 / c6;
 555                                 if (preloadSigma6[i][s]
 556                                     < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */
 557                                 {
 558                                     preloadSigma6[i][s] = sigma6_min;
 559                                 }
 560                             }
 561                             else
 562                             {
 563                                 preloadSigma6[i][s] = sigma6_def;
 564                             }
 565                         }
 566                         if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 567                         {
 568                             const real c6  = nbfp[2 * typeIndices[i][s]];
 569                             const real c12 = nbfp[2 * typeIndices[i][s] + 1];
 570                             if (c6 > 0 && c12 > 0)
 571                             {
 572                                 /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
 573                                 preloadSigma6VdWGapsys[i][s] = 0.5_real * c12 / c6;
 574                                 if (preloadSigma6VdWGapsys[i][s] < sigma6VdWGapsys)
 575                                 {
 576                                     preloadSigma6VdWGapsys[i][s] = sigma6VdWGapsys;
 577                                 }
 578                             }
 579                             else
 580                             {
 581                                 preloadSigma6VdWGapsys[i][s] = sigma6VdWGapsys;
 582                             }
 583                         }
 584                     }
 585                     if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 586                     {
 587                         /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
 588                         const real c12A = nbfp[2 * typeIndices[STATE_A][s] + 1];
 589                         const real c12B = nbfp[2 * typeIndices[STATE_B][s] + 1];
 590                         if (c12A > 0 && c12B > 0)
 591                         {
 592                             preloadAlphaVdwEff[s]  = 0;
 593                             preloadAlphaCoulEff[s] = 0;
 594                         }
 595                         else
 596                         {
 597                             preloadAlphaVdwEff[s]  = alpha_vdw;
 598                             preloadAlphaCoulEff[s] = alpha_coul;
 599                         }
 600                     }
 601                     if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 602                     {
 603                         /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
 604                         const real c12A = nbfp[2 * typeIndices[STATE_A][s] + 1];
 605                         const real c12B = nbfp[2 * typeIndices[STATE_B][s] + 1];
 606                         if (c12A > 0 && c12B > 0)
 607                         {
 608                             preloadScaleLinpointVdWGapsys[s]  = 0;
 609                             preloadScaleLinpointCoulGapsys[s] = 0;
 610                         }
 611                         else
 612                         {
 613                             preloadScaleLinpointVdWGapsys[s]  = scaleLinpointVdWGapsys;
 614                             preloadScaleLinpointCoulGapsys[s] = scaleLinpointCoulGapsys;
 615                         }
 616                     }
 617                 }
 618                 else
 619                 {
 620                     preloadJnr[s]                     = jjnr[k];
 621                     preloadPairIsValid[s]             = false;
 622                     preloadPairIncluded[s]            = false;
 623                     preloadAlphaVdwEff[s]             = 0;
 624                     preloadAlphaCoulEff[s]            = 0;
 625                     preloadScaleLinpointVdWGapsys[s]  = 0;
 626                     preloadScaleLinpointCoulGapsys[s] = 0;
 627
 628                     for (int i = 0; i < NSTATES; i++)
 629                     {
 630                         typeIndices[STATE_A][s]      = ntiA + typeA[jjnr[k]];
 631                         typeIndices[STATE_B][s]      = ntiB + typeB[jjnr[k]];
 632                         preloadLjPmeC6Grid[i][s]     = 0;
 633                         preloadQq[i][s]              = 0;
 634                         preloadSigma6[i][s]          = 0;
 635                         preloadSigma6VdWGapsys[i][s] = 0;
 636                     }
 637                 }
 638             }
 639
 640             RealType jx, jy, jz;
 641             gmx::gatherLoadUTranspose<3>(reinterpret_cast<const real*>(x), preloadJnr, &jx, &jy, &jz);
 642
 643             const RealType pairIsValid   = gmx::load<RealType>(preloadPairIsValid);
 644             const RealType pairIncluded  = gmx::load<RealType>(preloadPairIncluded);
 645             const BoolType bPairIncluded = (pairIncluded != zero);
 646             const BoolType bPairExcluded = (pairIncluded == zero && pairIsValid != zero);
 647
 648             const RealType dX  = ix - jx;
 649             const RealType dY  = iy - jy;
 650             const RealType dZ  = iz - jz;
 651             RealType       rSq = dX * dX + dY * dY + dZ * dZ;
 652
 653             BoolType withinCutoffMask = (rSq < rcutoff_max2);
 654
 655             if (!gmx::anyTrue(withinCutoffMask || bPairExcluded))
 656             {
 657                 /* We save significant time by skipping all code below.
 658                  * Note that with soft-core interactions, the actual cut-off
 659                  * check might be different. But since the soft-core distance
 660                  * is always larger than r, checking on r here is safe.
 661                  * Exclusions outside the cutoff can not be skipped as
 662                  * when using Ewald: the reciprocal-space
 663                  * Ewald component still needs to be subtracted.
 664                  */
 665                 continue;
 666             }
 667             else
 668             {
 669                 havePairsWithinCutoff = true;
 670             }
 671
 672             if (gmx::anyTrue(rlistSquared < rSq && bPairExcluded))
 673             {
 674                 haveExcludedPairsBeyondRlist = true;
 675             }
 676
 677             const IntType  jnr_s    = gmx::load<IntType>(preloadJnr);
 678             const BoolType bIiEqJnr = gmx::cvtIB2B(ii_s == jnr_s);
 679
 680             RealType            c6[NSTATES];
 681             RealType            c12[NSTATES];
 682             RealType gmx_unused sigma6[NSTATES];
 683             RealType            qq[NSTATES];
 684             RealType gmx_unused ljPmeC6Grid[NSTATES];
 685             RealType gmx_unused alphaVdwEff;
 686             RealType gmx_unused alphaCoulEff;
 687             RealType gmx_unused scaleLinpointVdWGapsysEff;
 688             RealType gmx_unused scaleLinpointCoulGapsysEff;
 689             RealType gmx_unused sigmaVdWGapsysEff[NSTATES];
 690             for (int i = 0; i < NSTATES; i++)
 691             {
 692                 gmx::gatherLoadTranspose<2>(nbfp.data(), typeIndices[i], &c6[i], &c12[i]);
 693                 qq[i]          = gmx::load<RealType>(preloadQq[i]);
 694                 ljPmeC6Grid[i] = gmx::load<RealType>(preloadLjPmeC6Grid[i]);
 695                 if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 696                 {
 697                     sigma6[i] = gmx::load<RealType>(preloadSigma6[i]);
 698                 }
 699                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 700                 {
 701                     sigmaVdWGapsysEff[i] = gmx::load<RealType>(preloadSigma6VdWGapsys[i]);
 702                 }
 703             }
 704             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 705             {
 706                 alphaVdwEff  = gmx::load<RealType>(preloadAlphaVdwEff);
 707                 alphaCoulEff = gmx::load<RealType>(preloadAlphaCoulEff);
 708             }
 709             if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 710             {
 711                 scaleLinpointVdWGapsysEff  = gmx::load<RealType>(preloadScaleLinpointVdWGapsys);
 712                 scaleLinpointCoulGapsysEff = gmx::load<RealType>(preloadScaleLinpointCoulGapsys);
 713             }
 714
 715             // Avoid overflow of r^-12 at distances near zero
 716             rSq  = gmx::max(rSq, minDistanceSquared);
 717             rInv = gmx::invsqrt(rSq);
 718             r    = rSq * rInv;
 719
 720             RealType gmx_unused rp, rpm2;
 721             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 722             {
 723                 rpm2 = rSq * rSq;  /* r4 */
 724                 rp   = rpm2 * rSq; /* r6 */
 725             }
 726             else
 727             {
 728                 /* The soft-core power p will not affect the results
 729                  * with not using soft-core, so we use power of 0 which gives
 730                  * the simplest math and cheapest code.
 731                  */
 732                 rpm2 = rInv * rInv;
 733                 rp   = one;
 734             }
 735
 736             RealType fScal(0);
 737
 738             /* The following block is masked to only calculate values having bPairIncluded. If
 739              * bPairIncluded is true then withinCutoffMask must also be true. */
 740             if (gmx::anyTrue(withinCutoffMask && bPairIncluded))
 741             {
 742                 RealType gmx_unused fScalC[NSTATES], fScalV[NSTATES];
 743                 RealType            vCoul[NSTATES], vVdw[NSTATES];
 744                 for (int i = 0; i < NSTATES; i++)
 745                 {
 746                     fScalC[i] = zero;
 747                     fScalV[i] = zero;
 748                     vCoul[i]  = zero;
 749                     vVdw[i]   = zero;
 750
 751                     RealType gmx_unused rInvC, rInvV, rC, rV, rPInvC, rPInvV;
 752
 753                     /* The following block is masked to require (qq[i] != 0 || c6[i] != 0 || c12[i]
 754                      * != 0) in addition to bPairIncluded, which in turn requires withinCutoffMask. */
 755                     BoolType nonZeroState = ((qq[i] != zero || c6[i] != zero || c12[i] != zero)
 756                                              && bPairIncluded && withinCutoffMask);
 757                     if (gmx::anyTrue(nonZeroState))
 758                     {
 759                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 760                         {
 761                             RealType divisor = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
 762                             rPInvC           = gmx::inv(divisor);
 763                             sixthRoot(rPInvC, &rInvC, &rC);
 764
 765                             if constexpr (scLambdasOrAlphasDiffer)
 766                             {
 767                                 RealType divisor = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
 768                                 rPInvV           = gmx::inv(divisor);
 769                                 sixthRoot(rPInvV, &rInvV, &rV);
 770                             }
 771                             else
 772                             {
 773                                 /* We can avoid one expensive pow and one / operation */
 774                                 rPInvV = rPInvC;
 775                                 rInvV  = rInvC;
 776                                 rV     = rC;
 777                             }
 778                         }
 779                         else
 780                         {
 781                             rPInvC = one;
 782                             rInvC  = rInv;
 783                             rC     = r;
 784
 785                             rPInvV = one;
 786                             rInvV  = rInv;
 787                             rV     = r;
 788                         }
 789
 790                         /* Only process the coulomb interactions if we either
 791                          * include all entries in the list (no cutoff
 792                          * used in the kernel), or if we are within the cutoff.
 793                          */
 794                         BoolType computeElecInteraction;
 795                         if constexpr (elecInteractionTypeIsEwald)
 796                         {
 797                             computeElecInteraction = (r < rCoulomb && qq[i] != zero && bPairIncluded);
 798                         }
 799                         else
 800                         {
 801                             computeElecInteraction = (rC < rCoulomb && qq[i] != zero && bPairIncluded);
 802                         }
 803                         if (gmx::anyTrue(computeElecInteraction))
 804                         {
 805                             if constexpr (elecInteractionTypeIsEwald)
 806                             {
 807                                 vCoul[i] = ewaldPotential(qq[i], rInvC, sh_ewald);
 808                                 if constexpr (computeForces)
 809                                 {
 810                                     fScalC[i] = ewaldScalarForce(qq[i], rInvC);
 811                                 }
 812
 813                                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 814                                 {
 815                                     ewaldQuadraticPotential(qq[i],
 816                                                             facel,
 817                                                             rC,
 818                                                             rCutoffCoul,
 819                                                             LFC[i],
 820                                                             DLF[i],
 821                                                             scaleLinpointCoulGapsysEff,
 822                                                             sh_ewald,
 823                                                             &fScalC[i],
 824                                                             &vCoul[i],
 825                                                             &dvdlCoul,
 826                                                             computeElecInteraction);
 827                                 }
 828                             }
 829                             else
 830                             {
 831                                 vCoul[i] = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
 832                                 if constexpr (computeForces)
 833                                 {
 834                                     fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
 835                                 }
 836
 837                                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 838                                 {
 839                                     reactionFieldQuadraticPotential(qq[i],
 840                                                                     facel,
 841                                                                     rC,
 842                                                                     rCutoffCoul,
 843                                                                     LFC[i],
 844                                                                     DLF[i],
 845                                                                     scaleLinpointCoulGapsysEff,
 846                                                                     krf,
 847                                                                     crf,
 848                                                                     &fScalC[i],
 849                                                                     &vCoul[i],
 850                                                                     &dvdlCoul,
 851                                                                     computeElecInteraction);
 852                                 }
 853                             }
 854
 855                             vCoul[i] = gmx::selectByMask(vCoul[i], computeElecInteraction);
 856                             if constexpr (computeForces)
 857                             {
 858                                 fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
 859                             }
 860                         }
 861
 862                         /* Only process the VDW interactions if we either
 863                          * include all entries in the list (no cutoff used
 864                          * in the kernel), or if we are within the cutoff.
 865                          */
 866                         BoolType computeVdwInteraction;
 867                         if constexpr (vdwInteractionTypeIsEwald)
 868                         {
 869                             computeVdwInteraction =
 870                                     (r < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
 871                         }
 872                         else
 873                         {
 874                             computeVdwInteraction =
 875                                     (rV < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
 876                         }
 877                         if (gmx::anyTrue(computeVdwInteraction))
 878                         {
 879                             RealType rInv6;
 880                             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 881                             {
 882                                 rInv6 = rPInvV;
 883                             }
 884                             else
 885                             {
 886                                 rInv6 = calculateRinv6(rInvV);
 887                             }
 888                             // Avoid overflow at short distance for masked exclusions and
 889                             // for foreign energy calculations at a hard core end state.
 890                             // Note that we should limit r^-6, and thus also r^-12, and
 891                             // not only r^-12, as that could lead to erroneously low instead
 892                             // of very high foreign energies.
 893                             rInv6           = gmx::min(rInv6, maxRInvSix);
 894                             RealType vVdw6  = calculateVdw6(c6[i], rInv6);
 895                             RealType vVdw12 = calculateVdw12(c12[i], rInv6);
 896
 897                             vVdw[i] = lennardJonesPotential(
 898                                     vVdw6, vVdw12, c6[i], c12[i], repulsionShift, dispersionShift, oneSixth, oneTwelfth);
 899                             if constexpr (computeForces)
 900                             {
 901                                 fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
 902                             }
 903
 904                             if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 905                             {
 906                                 lennardJonesQuadraticPotential(c6[i],
 907                                                                c12[i],
 908                                                                r,
 909                                                                rSq,
 910                                                                LFV[i],
 911                                                                DLF[i],
 912                                                                sigmaVdWGapsysEff[i],
 913                                                                scaleLinpointVdWGapsysEff,
 914                                                                repulsionShift,
 915                                                                dispersionShift,
 916                                                                &fScalV[i],
 917                                                                &vVdw[i],
 918                                                                &dvdlVdw,
 919                                                                computeVdwInteraction);
 920                             }
 921
 922                             if constexpr (vdwInteractionTypeIsEwald)
 923                             {
 924                                 /* Subtract the grid potential at the cut-off */
 925                                 vVdw[i] = vVdw[i]
 926                                           + gmx::selectByMask(ewaldLennardJonesGridSubtract(
 927                                                                       ljPmeC6Grid[i], shLjEwald, oneSixth),
 928                                                               computeVdwInteraction);
 929                             }
 930
 931                             if constexpr (vdwModifierIsPotSwitch)
 932                             {
 933                                 RealType d             = rV - rVdwSwitch;
 934                                 BoolType zeroMask      = zero < d;
 935                                 BoolType potSwitchMask = rV < rVdw;
 936                                 d                      = gmx::selectByMask(d, zeroMask);
 937                                 const RealType d2      = d * d;
 938                                 const RealType sw =
 939                                         one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
 940
 941                                 if constexpr (computeForces)
 942                                 {
 943                                     const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
 944                                     fScalV[i]          = potSwitchScalarForceMod(
 945                                             fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
 946                                 }
 947                                 vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, potSwitchMask);
 948                             }
 949
 950                             vVdw[i] = gmx::selectByMask(vVdw[i], computeVdwInteraction);
 951                             if constexpr (computeForces)
 952                             {
 953                                 fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
 954                             }
 955                         }
 956
 957                         if constexpr (computeForces)
 958                         {
 959                             /* fScalC (and fScalV) now contain: dV/drC * rC
 960                              * Now we multiply by rC^-6, so it will be: dV/drC * rC^-5
 961                              * Further down we first multiply by r^4 and then by
 962                              * the vector r, which in total gives: dV/drC * (r/rC)^-5
 963                              */
 964                             fScalC[i] = fScalC[i] * rPInvC;
 965                             fScalV[i] = fScalV[i] * rPInvV;
 966                         }
 967                     } // end of block requiring nonZeroState
 968                 }     // end for (int i = 0; i < NSTATES; i++)
 969
 970                 /* Assemble A and B states. */
 971                 BoolType assembleStates = (bPairIncluded && withinCutoffMask);
 972                 if (gmx::anyTrue(assembleStates))
 973                 {
 974                     for (int i = 0; i < NSTATES; i++)
 975                     {
 976                         vCTot = vCTot + LFC[i] * vCoul[i];
 977                         vVTot = vVTot + LFV[i] * vVdw[i];
 978
 979                         if constexpr (computeForces)
 980                         {
 981                             fScal = fScal + LFC[i] * fScalC[i] * rpm2;
 982                             fScal = fScal + LFV[i] * fScalV[i] * rpm2;
 983                         }
 984
 985                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 986                         {
 987                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i]
 988                                        + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i];
 989                             dvdlVdw = dvdlVdw + vVdw[i] * DLF[i]
 990                                       + LFV[i] * alphaVdwEff * dlFacVdw[i] * fScalV[i] * sigma6[i];
 991                         }
 992                         else
 993                         {
 994                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i];
 995                             dvdlVdw  = dvdlVdw + vVdw[i] * DLF[i];
 996                         }
 997                     }
 998                 }
 999             } // end of block requiring bPairIncluded && withinCutoffMask
1000             /* In the following block bPairIncluded should be false in the masks. */
1001             if (icoul == NbkernelElecType::ReactionField)
1002             {
1003                 const BoolType computeReactionField = bPairExcluded;
1004
1005                 if (gmx::anyTrue(computeReactionField))
1006                 {
1007                     /* For excluded pairs we don't use soft-core.
1008                      * As there is no singularity, there is no need for soft-core.
1009                      */
1010                     const RealType FF = -two * krf;
1011                     RealType       VV = krf * rSq - crf;
1012
1013                     /* If ii == jnr the i particle (ii) has itself (jnr)
1014                      * in its neighborlist. This corresponds to a self-interaction
1015                      * that will occur twice. Scale it down by 50% to only include
1016                      * it once.
1017                      */
1018                     VV = VV * gmx::blend(one, half, bIiEqJnr);
1019
1020                     for (int i = 0; i < NSTATES; i++)
1021                     {
1022                         vCTot = vCTot + gmx::selectByMask(LFC[i] * qq[i] * VV, computeReactionField);
1023                         fScal = fScal + gmx::selectByMask(LFC[i] * qq[i] * FF, computeReactionField);
1024                         dvdlCoul = dvdlCoul + gmx::selectByMask(DLF[i] * qq[i] * VV, computeReactionField);
1025                     }
1026                 }
1027             }
1028
1029             const BoolType computeElecEwaldInteraction = (bPairExcluded || r < rCoulomb);
1030             if (elecInteractionTypeIsEwald && gmx::anyTrue(computeElecEwaldInteraction))
1031             {
1032                 /* See comment in the preamble. When using Ewald interactions
1033                  * (unless we use a switch modifier) we subtract the reciprocal-space
1034                  * Ewald component here which made it possible to apply the free
1035                  * energy interaction to 1/r (vanilla coulomb short-range part)
1036                  * above. This gets us closer to the ideal case of applying
1037                  * the softcore to the entire electrostatic interaction,
1038                  * including the reciprocal-space component.
1039                  */
1040                 RealType v_lr, f_lr;
1041
1042                 pmeCoulombCorrectionVF<computeForces>(rSq, ewaldBeta, &v_lr, &f_lr);
1043                 if constexpr (computeForces)
1044                 {
1045                     f_lr = f_lr * rInv * rInv;
1046                 }
1047
1048                 /* Note that any possible Ewald shift has already been applied in
1049                  * the normal interaction part above.
1050                  */
1051
1052                 /* If ii == jnr the i particle (ii) has itself (jnr)
1053                  * in its neighborlist. This corresponds to a self-interaction
1054                  * that will occur twice. Scale it down by 50% to only include
1055                  * it once.
1056                  */
1057                 v_lr = v_lr * gmx::blend(one, half, bIiEqJnr);
1058
1059                 for (int i = 0; i < NSTATES; i++)
1060                 {
1061                     vCTot = vCTot - gmx::selectByMask(LFC[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1062                     if constexpr (computeForces)
1063                     {
1064                         fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
1065                     }
1066                     dvdlCoul = dvdlCoul
1067                                - gmx::selectByMask(DLF[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1068                 }
1069             }
1070
1071             const BoolType computeVdwEwaldInteraction = (bPairExcluded || r < rVdw);
1072             if (vdwInteractionTypeIsEwald && gmx::anyTrue(computeVdwEwaldInteraction))
1073             {
1074                 /* See comment in the preamble. When using LJ-Ewald interactions
1075                  * (unless we use a switch modifier) we subtract the reciprocal-space
1076                  * Ewald component here which made it possible to apply the free
1077                  * energy interaction to r^-6 (vanilla LJ6 short-range part)
1078                  * above. This gets us closer to the ideal case of applying
1079                  * the softcore to the entire VdW interaction,
1080                  * including the reciprocal-space component.
1081                  */
1082
1083                 RealType v_lr, f_lr;
1084                 pmeLJCorrectionVF<computeForces>(
1085                         rInv, rSq, ewaldLJCoeffSq, ewaldLJCoeffSixDivSix, &v_lr, &f_lr, computeVdwEwaldInteraction, bIiEqJnr);
1086                 v_lr = v_lr * oneSixth;
1087
1088                 for (int i = 0; i < NSTATES; i++)
1089                 {
1090                     vVTot = vVTot + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1091                     if constexpr (computeForces)
1092                     {
1093                         fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
1094                     }
1095                     dvdlVdw = dvdlVdw + gmx::selectByMask(DLF[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1096                 }
1097             }
1098
1099             if (computeForces && gmx::anyTrue(fScal != zero))
1100             {
1101                 const RealType tX = fScal * dX;
1102                 const RealType tY = fScal * dY;
1103                 const RealType tZ = fScal * dZ;
1104                 fIX               = fIX + tX;
1105                 fIY               = fIY + tY;
1106                 fIZ               = fIZ + tZ;
1107
1108                 gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ);
1109             }
1110         } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
1111
1112         if (havePairsWithinCutoff)
1113         {
1114             if constexpr (computeForces)
1115             {
1116                 gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ);
1117
1118                 if (doShiftForces)
1119                 {
1120                     gmx::transposeScatterIncrU<3>(
1121                             reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
1122                 }
1123             }
1124             if (doPotential)
1125             {
1126                 int ggid = gid[n];
1127                 threadVc[ggid] += gmx::reduce(vCTot);
1128                 threadVv[ggid] += gmx::reduce(vVTot);
1129             }
1130         }
1131     } // end for (int n = 0; n < nri; n++)
1132
1133     if (gmx::anyTrue(dvdlCoul != zero))
1134     {
1135         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)] += gmx::reduce(dvdlCoul);
1136     }
1137     if (gmx::anyTrue(dvdlVdw != zero))
1138     {
1139         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)] += gmx::reduce(dvdlVdw);
1140     }
1141
1142     /* Estimate flops, average for free energy stuff:
1143      * 12  flops per outer iteration
1144      * 150 flops per inner iteration
1145      * TODO: Update the number of flops and/or use different counts for different code paths.
1146      */
1147     atomicNrnbIncrement(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist.nri * 12 + nlist.jindex[nri] * 150);
1148
1149     if (haveExcludedPairsBeyondRlist > 0)
1150     {
1151         gmx_fatal(FARGS,
1152                   "There are perturbed non-bonded pair interactions beyond the pair-list cutoff "
1153                   "of %g nm, which is not supported. This can happen because the system is "
1154                   "unstable or because intra-molecular interactions at long distances are "
1155                   "excluded. If the "
1156                   "latter is the case, you can try to increase nstlist or rlist to avoid this."
1157                   "The error is likely triggered by the use of couple-intramol=no "
1158                   "and the maximal distance in the decoupled molecule exceeding rlist.",
1159                   rlist);
1160     }
1161 }
1162
1163 typedef void (*KernelFunction)(const t_nblist&                                  nlist,
1164                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1165                                const int                                        ntype,
1166                                const real                                       rlist,
1167                                const interaction_const_t&                       ic,
1168                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1169                                gmx::ArrayRef<const real>                        nbfp,
1170                                gmx::ArrayRef<const real>                        nbfp_grid,
1171                                gmx::ArrayRef<const real>                        chargeA,
1172                                gmx::ArrayRef<const real>                        chargeB,
1173                                gmx::ArrayRef<const int>                         typeA,
1174                                gmx::ArrayRef<const int>                         typeB,
1175                                int                                              flags,
1176                                gmx::ArrayRef<const real>                        lambda,
1177                                t_nrnb* gmx_restrict                             nrnb,
1178                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1179                                rvec*               threadForceShiftBuffer,
1180                                gmx::ArrayRef<real> threadVc,
1181                                gmx::ArrayRef<real> threadVv,
1182                                gmx::ArrayRef<real> threadDvdl);
1183
1184 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
1185 static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
1186 {
1187     if (useSimd)
1188     {
1189 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS
1190         return (nb_free_energy_kernel<SimdDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1191 #else
1192         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch computeForces>);
1193 #endif
1194     }
1195     else
1196     {
1197         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1198     }
1199 }
1200
1201 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
1202 static KernelFunction dispatchKernelOnComputeForces(const bool computeForces, const bool useSimd)
1203 {
1204     if (computeForces)
1205     {
1206         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, true>(
1207                 useSimd));
1208     }
1209     else
1210     {
1211         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, false>(
1212                 useSimd));
1213     }
1214 }
1215
1216 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
1217 static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch,
1218                                                   const bool computeForces,
1219                                                   const bool useSimd)
1220 {
1221     if (vdwModifierIsPotSwitch)
1222     {
1223         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
1224                 computeForces, useSimd));
1225     }
1226     else
1227     {
1228         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
1229                 computeForces, useSimd));
1230     }
1231 }
1232
1233 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
1234 static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
1235                                                           const bool vdwModifierIsPotSwitch,
1236                                                           const bool computeForces,
1237                                                           const bool useSimd)
1238 {
1239     if (elecInteractionTypeIsEwald)
1240     {
1241         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
1242                 vdwModifierIsPotSwitch, computeForces, useSimd));
1243     }
1244     else
1245     {
1246         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
1247                 vdwModifierIsPotSwitch, computeForces, useSimd));
1248     }
1249 }
1250
1251 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer>
1252 static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
1253                                                          const bool elecInteractionTypeIsEwald,
1254                                                          const bool vdwModifierIsPotSwitch,
1255                                                          const bool computeForces,
1256                                                          const bool useSimd)
1257 {
1258     if (vdwInteractionTypeIsEwald)
1259     {
1260         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, true>(
1261                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1262     }
1263     else
1264     {
1265         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, false>(
1266                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1267     }
1268 }
1269
1270 template<KernelSoftcoreType softcoreType>
1271 static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
1272                                                                   const bool vdwInteractionTypeIsEwald,
1273                                                                   const bool elecInteractionTypeIsEwald,
1274                                                                   const bool vdwModifierIsPotSwitch,
1275                                                                   const bool computeForces,
1276                                                                   const bool useSimd)
1277 {
1278     if (scLambdasOrAlphasDiffer)
1279     {
1280         return (dispatchKernelOnVdwInteractionType<softcoreType, true>(
1281                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1282     }
1283     else
1284     {
1285         return (dispatchKernelOnVdwInteractionType<softcoreType, false>(
1286                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1287     }
1288 }
1289
1290 static KernelFunction dispatchKernel(const bool                 scLambdasOrAlphasDiffer,
1291                                      const bool                 vdwInteractionTypeIsEwald,
1292                                      const bool                 elecInteractionTypeIsEwald,
1293                                      const bool                 vdwModifierIsPotSwitch,
1294                                      const bool                 computeForces,
1295                                      const bool                 useSimd,
1296                                      const interaction_const_t& ic)
1297 {
1298     const auto& scParams = *ic.softCoreParameters;
1299     if (scParams.softcoreType == SoftcoreType::Beutler)
1300     {
1301         if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
1302         {
1303             return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
1304                     scLambdasOrAlphasDiffer,
1305                     vdwInteractionTypeIsEwald,
1306                     elecInteractionTypeIsEwald,
1307                     vdwModifierIsPotSwitch,
1308                     computeForces,
1309                     useSimd));
1310         }
1311         return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Beutler>(
1312                 scLambdasOrAlphasDiffer,
1313                 vdwInteractionTypeIsEwald,
1314                 elecInteractionTypeIsEwald,
1315                 vdwModifierIsPotSwitch,
1316                 computeForces,
1317                 useSimd));
1318     }
1319     else // Gapsys
1320     {
1321         if (scParams.scaleLinpointCoulGapsys == 0 && scParams.scaleLinpointVdWGapsys == 0)
1322         {
1323             return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
1324                     scLambdasOrAlphasDiffer,
1325                     vdwInteractionTypeIsEwald,
1326                     elecInteractionTypeIsEwald,
1327                     vdwModifierIsPotSwitch,
1328                     computeForces,
1329                     useSimd));
1330         }
1331         return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Gapsys>(
1332                 scLambdasOrAlphasDiffer,
1333                 vdwInteractionTypeIsEwald,
1334                 elecInteractionTypeIsEwald,
1335                 vdwModifierIsPotSwitch,
1336                 computeForces,
1337                 useSimd));
1338     }
1339 }
1340
1341
1342 void gmx_nb_free_energy_kernel(const t_nblist&                                  nlist,
1343                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1344                                const bool                                       useSimd,
1345                                const int                                        ntype,
1346                                const real                                       rlist,
1347                                const interaction_const_t&                       ic,
1348                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1349                                gmx::ArrayRef<const real>                        nbfp,
1350                                gmx::ArrayRef<const real>                        nbfp_grid,
1351                                gmx::ArrayRef<const real>                        chargeA,
1352                                gmx::ArrayRef<const real>                        chargeB,
1353                                gmx::ArrayRef<const int>                         typeA,
1354                                gmx::ArrayRef<const int>                         typeB,
1355                                int                                              flags,
1356                                gmx::ArrayRef<const real>                        lambda,
1357                                t_nrnb*                                          nrnb,
1358                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1359                                rvec*               threadForceShiftBuffer,
1360                                gmx::ArrayRef<real> threadVc,
1361                                gmx::ArrayRef<real> threadVv,
1362                                gmx::ArrayRef<real> threadDvdl)
1363 {
1364     GMX_ASSERT(EEL_PME_EWALD(ic.eeltype) || ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype),
1365                "Unsupported eeltype with free energy");
1366     GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters");
1367
1368     // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert
1369     GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty()
1370                        || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(),
1371                "We need actual padding with at least one element for SIMD scatter operations");
1372
1373     const auto& scParams                   = *ic.softCoreParameters;
1374     const bool  vdwInteractionTypeIsEwald  = (EVDW_PME(ic.vdwtype));
1375     const bool  elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype));
1376     const bool  vdwModifierIsPotSwitch     = (ic.vdw_modifier == InteractionModifiers::PotSwitch);
1377     const bool  computeForces              = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
1378     bool        scLambdasOrAlphasDiffer    = true;
1379
1380     if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
1381     {
1382         scLambdasOrAlphasDiffer = false;
1383     }
1384     else
1385     {
1386         if (lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
1387                     == lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]
1388             && scParams.alphaCoulomb == scParams.alphaVdw)
1389         {
1390             scLambdasOrAlphasDiffer = false;
1391         }
1392     }
1393
1394     KernelFunction kernelFunc;
1395     kernelFunc = dispatchKernel(scLambdasOrAlphasDiffer,
1396                                 vdwInteractionTypeIsEwald,
1397                                 elecInteractionTypeIsEwald,
1398                                 vdwModifierIsPotSwitch,
1399                                 computeForces,
1400                                 useSimd,
1401                                 ic);
1402     kernelFunc(nlist,
1403                coords,
1404                ntype,
1405                rlist,
1406                ic,
1407                shiftvec,
1408                nbfp,
1409                nbfp_grid,
1410                chargeA,
1411                chargeB,
1412                typeA,
1413                typeB,
1414                flags,
1415                lambda,
1416                nrnb,
1417                threadForceBuffer,
1418                threadForceShiftBuffer,
1419                threadVc,
1420                threadVv,
1421                threadDvdl);
1422 }