src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   5  * Copyright (c) 2001-2004, The GROMACS development team.
   6  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
   7  * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   8  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   9  * and including many others, as listed in the AUTHORS file in the
  10  * top-level source directory and at http://www.gromacs.org.
  11  *
  12  * GROMACS is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public License
  14  * as published by the Free Software Foundation; either version 2.1
  15  * of the License, or (at your option) any later version.
  16  *
  17  * GROMACS is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with GROMACS; if not, see
  24  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  25  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  26  *
  27  * If you want to redistribute modifications to GROMACS, please
  28  * consider that scientific software is very special. Version
  29  * control is crucial - bugs must be traceable. We will be happy to
  30  * consider code for inclusion in the official distribution, but
  31  * derived work must not be called official GROMACS. Details are found
  32  * in the README & COPYING files - if they are missing, get the
  33  * official version at http://www.gromacs.org.
  34  *
  35  * To help us fund GROMACS development, we humbly ask that you cite
  36  * the research papers on the package. Check out http://www.gromacs.org.
  37  */
  38 #include "gmxpre.h"
  39
  40 #include "nb_free_energy.h"
  41
  42 #include "config.h"
  43
  44 #include <cmath>
  45 #include <set>
  46
  47 #include <algorithm>
  48
  49 #include "gromacs/gmxlib/nrnb.h"
  50 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
  51 #include "gromacs/math/arrayrefwithpadding.h"
  52 #include "gromacs/math/functions.h"
  53 #include "gromacs/math/vec.h"
  54 #include "gromacs/mdtypes/forceoutput.h"
  55 #include "gromacs/mdtypes/forcerec.h"
  56 #include "gromacs/mdtypes/interaction_const.h"
  57 #include "gromacs/mdtypes/md_enums.h"
  58 #include "gromacs/mdtypes/mdatom.h"
  59 #include "gromacs/mdtypes/nblist.h"
  60 #include "gromacs/pbcutil/ishift.h"
  61 #include "gromacs/simd/simd.h"
  62 #include "gromacs/simd/simd_math.h"
  63 #include "gromacs/utility/fatalerror.h"
  64 #include "gromacs/utility/arrayref.h"
  65
  66 #include "nb_softcore.h"
  67
  68 //! Scalar (non-SIMD) data types.
  69 struct ScalarDataTypes
  70 {
  71     using RealType = real; //!< The data type to use as real.
  72     using IntType  = int;  //!< The data type to use as int.
  73     using BoolType = bool; //!< The data type to use as bool for real value comparison.
  74     static constexpr int simdRealWidth = 1; //!< The width of the RealType.
  75     static constexpr int simdIntWidth  = 1; //!< The width of the IntType.
  76 };
  77
  78 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS
  79 //! SIMD data types.
  80 struct SimdDataTypes
  81 {
  82     using RealType = gmx::SimdReal;  //!< The data type to use as real.
  83     using IntType  = gmx::SimdInt32; //!< The data type to use as int.
  84     using BoolType = gmx::SimdBool;  //!< The data type to use as bool for real value comparison.
  85     static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH; //!< The width of the RealType.
  86 #    if GMX_SIMD_HAVE_DOUBLE && GMX_DOUBLE
  87     static constexpr int simdIntWidth = GMX_SIMD_DINT32_WIDTH; //!< The width of the IntType.
  88 #    else
  89     static constexpr int simdIntWidth = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType.
  90 #    endif
  91 };
  92 #endif
  93
  94 /*! \brief Lower limit for square interaction distances in nonbonded kernels.
  95  *
  96  * This is a mimimum on r^2 to avoid overflows when computing r^6.
  97  * This will only affect results for soft-cored interaction at distances smaller
  98  * than 1e-6 and will limit extremely high foreign energies for overlapping atoms.
  99  * Note that we could use a somewhat smaller minimum in double precision.
 100  * But because invsqrt in double precision can use single precision, this number
 101  * can not be much smaller, we use the same number for simplicity.
 102  */
 103 constexpr real c_minDistanceSquared = 1.0e-12_real;
 104
 105 /*! \brief Higher limit for r^-6 used for Lennard-Jones interactions
 106  *
 107  * This is needed to avoid overflow of LJ energy and force terms for excluded
 108  * atoms and foreign energies of hard-core states of overlapping atoms.
 109  * Note that in single precision this value leaves room for C12 coefficients up to 3.4e8.
 110  */
 111 constexpr real c_maxRInvSix = 1.0e15_real;
 112
 113 template<bool computeForces, class RealType>
 114 static inline void
 115 pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType gmx_unused* force)
 116 {
 117     const RealType brsq = rSq * beta * beta;
 118     if constexpr (computeForces)
 119     {
 120         *force = -brsq * beta * gmx::pmeForceCorrection(brsq);
 121     }
 122     *pot = beta * gmx::pmePotentialCorrection(brsq);
 123 }
 124
 125 template<bool computeForces, class RealType, class BoolType>
 126 static inline void pmeLJCorrectionVF(const RealType rInv,
 127                                      const RealType rSq,
 128                                      const real     ewaldLJCoeffSq,
 129                                      const real     ewaldLJCoeffSixDivSix,
 130                                      RealType*      pot,
 131                                      RealType gmx_unused* force,
 132                                      const BoolType       mask,
 133                                      const BoolType       bIiEqJnr)
 134 {
 135     // We mask rInv to get zero force and potential for masked out pair interactions
 136     const RealType rInvSq  = rInv * rInv;
 137     const RealType rInvSix = rInvSq * rInvSq * rInvSq;
 138     // Mask rSq to avoid underflow in exp()
 139     const RealType coeffSqRSq       = ewaldLJCoeffSq * gmx::selectByMask(rSq, mask);
 140     const RealType expNegCoeffSqRSq = gmx::exp(-coeffSqRSq);
 141     const RealType poly             = 1.0_real + coeffSqRSq + 0.5_real * coeffSqRSq * coeffSqRSq;
 142     if constexpr (computeForces)
 143     {
 144         *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
 145         *force = *force * rInvSq;
 146     }
 147     // The self interaction is the limit for r -> 0 which we need to compute separately
 148     *pot = gmx::blend(
 149             rInvSix * (1.0_real - expNegCoeffSqRSq * poly), 0.5_real * ewaldLJCoeffSixDivSix, bIiEqJnr);
 150 }
 151
 152 //! Computes r^(1/6) and 1/r^(1/6)
 153 template<class RealType>
 154 static inline void sixthRoot(const RealType r, RealType* sixthRoot, RealType* invSixthRoot)
 155 {
 156     RealType cbrtRes = gmx::cbrt(r);
 157     *invSixthRoot    = gmx::invsqrt(cbrtRes);
 158     *sixthRoot       = gmx::inv(*invSixthRoot);
 159 }
 160
 161 template<class RealType>
 162 static inline RealType calculateRinv6(const RealType rInvV)
 163 {
 164     RealType rInv6 = rInvV * rInvV;
 165     return (rInv6 * rInv6 * rInv6);
 166 }
 167
 168 template<class RealType>
 169 static inline RealType calculateVdw6(const RealType c6, const RealType rInv6)
 170 {
 171     return (c6 * rInv6);
 172 }
 173
 174 template<class RealType>
 175 static inline RealType calculateVdw12(const RealType c12, const RealType rInv6)
 176 {
 177     return (c12 * rInv6 * rInv6);
 178 }
 179
 180 /* reaction-field electrostatics */
 181 template<class RealType>
 182 static inline RealType reactionFieldScalarForce(const RealType qq,
 183                                                 const RealType rInv,
 184                                                 const RealType r,
 185                                                 const real     krf,
 186                                                 const real     two)
 187 {
 188     return (qq * (rInv - two * krf * r * r));
 189 }
 190 template<class RealType>
 191 static inline RealType reactionFieldPotential(const RealType qq,
 192                                               const RealType rInv,
 193                                               const RealType r,
 194                                               const real     krf,
 195                                               const real     potentialShift)
 196 {
 197     return (qq * (rInv + krf * r * r - potentialShift));
 198 }
 199
 200 /* Ewald electrostatics */
 201 template<class RealType>
 202 static inline RealType ewaldScalarForce(const RealType coulomb, const RealType rInv)
 203 {
 204     return (coulomb * rInv);
 205 }
 206 template<class RealType>
 207 static inline RealType ewaldPotential(const RealType coulomb, const RealType rInv, const real potentialShift)
 208 {
 209     return (coulomb * (rInv - potentialShift));
 210 }
 211
 212 /* cutoff LJ */
 213 template<class RealType>
 214 static inline RealType lennardJonesScalarForce(const RealType v6, const RealType v12)
 215 {
 216     return (v12 - v6);
 217 }
 218 template<class RealType>
 219 static inline RealType lennardJonesPotential(const RealType v6,
 220                                              const RealType v12,
 221                                              const RealType c6,
 222                                              const RealType c12,
 223                                              const real     repulsionShift,
 224                                              const real     dispersionShift,
 225                                              const real     oneSixth,
 226                                              const real     oneTwelfth)
 227 {
 228     return ((v12 + c12 * repulsionShift) * oneTwelfth - (v6 + c6 * dispersionShift) * oneSixth);
 229 }
 230
 231 /* Ewald LJ */
 232 template<class RealType>
 233 static inline RealType ewaldLennardJonesGridSubtract(const RealType c6grid,
 234                                                      const real     potentialShift,
 235                                                      const real     oneSixth)
 236 {
 237     return (c6grid * potentialShift * oneSixth);
 238 }
 239
 240 /* LJ Potential switch */
 241 template<class RealType, class BoolType>
 242 static inline RealType potSwitchScalarForceMod(const RealType fScalarInp,
 243                                                const RealType potential,
 244                                                const RealType sw,
 245                                                const RealType r,
 246                                                const RealType dsw,
 247                                                const BoolType mask)
 248 {
 249     /* The mask should select on rV < rVdw */
 250     return (gmx::selectByMask(fScalarInp * sw - r * potential * dsw, mask));
 251 }
 252 template<class RealType, class BoolType>
 253 static inline RealType potSwitchPotentialMod(const RealType potentialInp, const RealType sw, const BoolType mask)
 254 {
 255     /* The mask should select on rV < rVdw */
 256     return (gmx::selectByMask(potentialInp * sw, mask));
 257 }
 258
 259
 260 //! Templated free-energy non-bonded kernel
 261 template<typename DataTypes, KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
 262 static void nb_free_energy_kernel(const t_nblist&                                  nlist,
 263                                   const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
 264                                   const int                                        ntype,
 265                                   const real                                       rlist,
 266                                   const interaction_const_t&                       ic,
 267                                   gmx::ArrayRef<const gmx::RVec>                   shiftvec,
 268                                   gmx::ArrayRef<const real>                        nbfp,
 269                                   gmx::ArrayRef<const real> gmx_unused             nbfp_grid,
 270                                   gmx::ArrayRef<const real>                        chargeA,
 271                                   gmx::ArrayRef<const real>                        chargeB,
 272                                   gmx::ArrayRef<const int>                         typeA,
 273                                   gmx::ArrayRef<const int>                         typeB,
 274                                   int                                              flags,
 275                                   gmx::ArrayRef<const real>                        lambda,
 276                                   t_nrnb* gmx_restrict                             nrnb,
 277                                   gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
 278                                   rvec gmx_unused*    threadForceShiftBuffer,
 279                                   gmx::ArrayRef<real> threadVc,
 280                                   gmx::ArrayRef<real> threadVv,
 281                                   gmx::ArrayRef<real> threadDvdl)
 282 {
 283 #define STATE_A 0
 284 #define STATE_B 1
 285 #define NSTATES 2
 286
 287     using RealType = typename DataTypes::RealType;
 288     using IntType  = typename DataTypes::IntType;
 289     using BoolType = typename DataTypes::BoolType;
 290
 291     constexpr real oneTwelfth = 1.0_real / 12.0_real;
 292     constexpr real oneSixth   = 1.0_real / 6.0_real;
 293     constexpr real zero       = 0.0_real;
 294     constexpr real half       = 0.5_real;
 295     constexpr real one        = 1.0_real;
 296     constexpr real two        = 2.0_real;
 297     constexpr real six        = 6.0_real;
 298
 299     // Extract pair list data
 300     const int                nri    = nlist.nri;
 301     gmx::ArrayRef<const int> iinr   = nlist.iinr;
 302     gmx::ArrayRef<const int> jindex = nlist.jindex;
 303     gmx::ArrayRef<const int> jjnr   = nlist.jjnr;
 304     gmx::ArrayRef<const int> shift  = nlist.shift;
 305     gmx::ArrayRef<const int> gid    = nlist.gid;
 306
 307     const real lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
 308     const real lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
 309
 310     // Extract softcore parameters
 311     const auto&           scParams   = *ic.softCoreParameters;
 312     const real            lam_power  = scParams.lambdaPower;
 313     const real gmx_unused alpha_coul = scParams.alphaCoulomb;
 314     const real gmx_unused alpha_vdw  = scParams.alphaVdw;
 315     const real gmx_unused sigma6_def = scParams.sigma6WithInvalidSigma;
 316     const real gmx_unused sigma6_min = scParams.sigma6Minimum;
 317
 318     const real gmx_unused gapsysScaleLinpointCoul = scParams.gapsysScaleLinpointCoul;
 319     const real gmx_unused gapsysScaleLinpointVdW  = scParams.gapsysScaleLinpointVdW;
 320     const real gmx_unused sigma6VdWGapsys         = scParams.sigma6VdWGapsys;
 321
 322     const bool gmx_unused doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
 323     const bool            doPotential   = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0);
 324
 325     // Extract data from interaction_const_t
 326     const real            facel           = ic.epsfac;
 327     const real            rCoulomb        = ic.rcoulomb;
 328     const real            krf             = ic.reactionFieldCoefficient;
 329     const real            crf             = ic.reactionFieldShift;
 330     const real gmx_unused shLjEwald       = ic.sh_lj_ewald;
 331     const real            rVdw            = ic.rvdw;
 332     const real            dispersionShift = ic.dispersion_shift.cpot;
 333     const real            repulsionShift  = ic.repulsion_shift.cpot;
 334     const real            ewaldBeta       = ic.ewaldcoeff_q;
 335     real gmx_unused       ewaldLJCoeffSq;
 336     real gmx_unused       ewaldLJCoeffSixDivSix;
 337     if constexpr (vdwInteractionTypeIsEwald)
 338     {
 339         ewaldLJCoeffSq        = ic.ewaldcoeff_lj * ic.ewaldcoeff_lj;
 340         ewaldLJCoeffSixDivSix = ewaldLJCoeffSq * ewaldLJCoeffSq * ewaldLJCoeffSq / six;
 341     }
 342
 343     // Note that the nbnxm kernels do not support Coulomb potential switching at all
 344     GMX_ASSERT(ic.coulomb_modifier != InteractionModifiers::PotSwitch,
 345                "Potential switching is not supported for Coulomb with FEP");
 346
 347     const real      rVdwSwitch = ic.rvdw_switch;
 348     real gmx_unused vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
 349     if constexpr (vdwModifierIsPotSwitch)
 350     {
 351         const real d = rVdw - rVdwSwitch;
 352         vdw_swV3     = -10.0_real / (d * d * d);
 353         vdw_swV4     = 15.0_real / (d * d * d * d);
 354         vdw_swV5     = -6.0_real / (d * d * d * d * d);
 355         vdw_swF2     = -30.0_real / (d * d * d);
 356         vdw_swF3     = 60.0_real / (d * d * d * d);
 357         vdw_swF4     = -30.0_real / (d * d * d * d * d);
 358     }
 359     else
 360     {
 361         /* Avoid warnings from stupid compilers (looking at you, Clang!) */
 362         vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = zero;
 363     }
 364
 365     NbkernelElecType icoul;
 366     if (ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype))
 367     {
 368         icoul = NbkernelElecType::ReactionField;
 369     }
 370     else
 371     {
 372         icoul = NbkernelElecType::None;
 373     }
 374
 375     real rcutoff_max2                 = std::max(ic.rcoulomb, ic.rvdw);
 376     rcutoff_max2                      = rcutoff_max2 * rcutoff_max2;
 377     const real gmx_unused rCutoffCoul = ic.rcoulomb;
 378
 379     real gmx_unused sh_ewald = 0;
 380     if constexpr (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
 381     {
 382         sh_ewald = ic.sh_ewald;
 383     }
 384
 385     /* For Ewald/PME interactions we cannot easily apply the soft-core component to
 386      * reciprocal space. When we use non-switched Ewald interactions, we
 387      * assume the soft-coring does not significantly affect the grid contribution
 388      * and apply the soft-core only to the full 1/r (- shift) pair contribution.
 389      *
 390      * However, we cannot use this approach for switch-modified since we would then
 391      * effectively end up evaluating a significantly different interaction here compared to the
 392      * normal (non-free-energy) kernels, either by applying a cutoff at a different
 393      * position than what the user requested, or by switching different
 394      * things (1/r rather than short-range Ewald). For these settings, we just
 395      * use the traditional short-range Ewald interaction in that case.
 396      */
 397     GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
 398                        "Can not apply soft-core to switched Ewald potentials");
 399
 400     const RealType            minDistanceSquared(c_minDistanceSquared);
 401     const RealType            maxRInvSix(c_maxRInvSix);
 402     const RealType gmx_unused floatMin(GMX_FLOAT_MIN);
 403
 404     RealType dvdlCoul(zero);
 405     RealType dvdlVdw(zero);
 406
 407     /* Lambda factor for state A, 1-lambda*/
 408     real LFC[NSTATES], LFV[NSTATES];
 409     LFC[STATE_A] = one - lambda_coul;
 410     LFV[STATE_A] = one - lambda_vdw;
 411
 412     /* Lambda factor for state B, lambda*/
 413     LFC[STATE_B] = lambda_coul;
 414     LFV[STATE_B] = lambda_vdw;
 415
 416     /*derivative of the lambda factor for state A and B */
 417     real DLF[NSTATES];
 418     DLF[STATE_A] = -one;
 419     DLF[STATE_B] = one;
 420
 421     real gmx_unused lFacCoul[NSTATES], dlFacCoul[NSTATES], lFacVdw[NSTATES], dlFacVdw[NSTATES];
 422     constexpr real  sc_r_power = six;
 423     for (int i = 0; i < NSTATES; i++)
 424     {
 425         lFacCoul[i]  = (lam_power == 2 ? (1 - LFC[i]) * (1 - LFC[i]) : (1 - LFC[i]));
 426         dlFacCoul[i] = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFC[i]) : 1);
 427         lFacVdw[i]   = (lam_power == 2 ? (1 - LFV[i]) * (1 - LFV[i]) : (1 - LFV[i]));
 428         dlFacVdw[i]  = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
 429     }
 430
 431     // We need pointers to real for SIMD access
 432     const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
 433     real* gmx_restrict       forceRealPtr;
 434     if constexpr (computeForces)
 435     {
 436         GMX_ASSERT(nri == 0 || !threadForceBuffer.empty(), "need a valid threadForceBuffer");
 437
 438         forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0];
 439
 440         if (doShiftForces)
 441         {
 442             GMX_ASSERT(threadForceShiftBuffer != nullptr, "need a valid threadForceShiftBuffer");
 443         }
 444     }
 445
 446     const real rlistSquared = gmx::square(rlist);
 447
 448     bool haveExcludedPairsBeyondRlist = false;
 449
 450     for (int n = 0; n < nri; n++)
 451     {
 452         bool havePairsWithinCutoff = false;
 453
 454         const int  is   = shift[n];
 455         const real shX  = shiftvec[is][XX];
 456         const real shY  = shiftvec[is][YY];
 457         const real shZ  = shiftvec[is][ZZ];
 458         const int  nj0  = jindex[n];
 459         const int  nj1  = jindex[n + 1];
 460         const int  ii   = iinr[n];
 461         const int  ii3  = 3 * ii;
 462         const real ix   = shX + x[ii3 + 0];
 463         const real iy   = shY + x[ii3 + 1];
 464         const real iz   = shZ + x[ii3 + 2];
 465         const real iqA  = facel * chargeA[ii];
 466         const real iqB  = facel * chargeB[ii];
 467         const int  ntiA = ntype * typeA[ii];
 468         const int  ntiB = ntype * typeB[ii];
 469         RealType   vCTot(0);
 470         RealType   vVTot(0);
 471         RealType   fIX(0);
 472         RealType   fIY(0);
 473         RealType   fIZ(0);
 474
 475 #if GMX_SIMD_HAVE_REAL
 476         alignas(GMX_SIMD_ALIGNMENT) int            preloadIi[DataTypes::simdRealWidth];
 477         alignas(GMX_SIMD_ALIGNMENT) int gmx_unused preloadIs[DataTypes::simdRealWidth];
 478 #else
 479         int            preloadIi[DataTypes::simdRealWidth];
 480         int gmx_unused preloadIs[DataTypes::simdRealWidth];
 481 #endif
 482         for (int s = 0; s < DataTypes::simdRealWidth; s++)
 483         {
 484             preloadIi[s] = ii;
 485             preloadIs[s] = shift[n];
 486         }
 487         IntType ii_s = gmx::load<IntType>(preloadIi);
 488
 489         for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
 490         {
 491             RealType r, rInv;
 492
 493 #if GMX_SIMD_HAVE_REAL
 494             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIsValid[DataTypes::simdRealWidth];
 495             alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIncluded[DataTypes::simdRealWidth];
 496             alignas(GMX_SIMD_ALIGNMENT) int32_t preloadJnr[DataTypes::simdRealWidth];
 497             alignas(GMX_SIMD_ALIGNMENT) int32_t typeIndices[NSTATES][DataTypes::simdRealWidth];
 498             alignas(GMX_SIMD_ALIGNMENT) real    preloadQq[NSTATES][DataTypes::simdRealWidth];
 499             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
 500             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
 501             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
 502             alignas(GMX_SIMD_ALIGNMENT)
 503                     real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
 504             alignas(GMX_SIMD_ALIGNMENT)
 505                     real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
 506             alignas(GMX_SIMD_ALIGNMENT)
 507                     real gmx_unused preloadSigma6VdWGapsys[NSTATES][DataTypes::simdRealWidth];
 508             alignas(GMX_SIMD_ALIGNMENT) real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
 509 #else
 510             real            preloadPairIsValid[DataTypes::simdRealWidth];
 511             real            preloadPairIncluded[DataTypes::simdRealWidth];
 512             int             preloadJnr[DataTypes::simdRealWidth];
 513             int             typeIndices[NSTATES][DataTypes::simdRealWidth];
 514             real            preloadQq[NSTATES][DataTypes::simdRealWidth];
 515             real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
 516             real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
 517             real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
 518             real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
 519             real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
 520             real gmx_unused preloadSigma6VdWGapsys[NSTATES][DataTypes::simdRealWidth];
 521             real            preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
 522 #endif
 523             for (int s = 0; s < DataTypes::simdRealWidth; s++)
 524             {
 525                 if (k + s < nj1)
 526                 {
 527                     preloadPairIsValid[s] = true;
 528                     /* Check if this pair on the exclusions list.*/
 529                     preloadPairIncluded[s]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + s]);
 530                     const int jnr           = jjnr[k + s];
 531                     preloadJnr[s]           = jnr;
 532                     typeIndices[STATE_A][s] = ntiA + typeA[jnr];
 533                     typeIndices[STATE_B][s] = ntiB + typeB[jnr];
 534                     preloadQq[STATE_A][s]   = iqA * chargeA[jnr];
 535                     preloadQq[STATE_B][s]   = iqB * chargeB[jnr];
 536
 537                     for (int i = 0; i < NSTATES; i++)
 538                     {
 539                         if constexpr (vdwInteractionTypeIsEwald)
 540                         {
 541                             preloadLjPmeC6Grid[i][s] = nbfp_grid[2 * typeIndices[i][s]];
 542                         }
 543                         else
 544                         {
 545                             preloadLjPmeC6Grid[i][s] = 0;
 546                         }
 547                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 548                         {
 549                             const real c6  = nbfp[2 * typeIndices[i][s]];
 550                             const real c12 = nbfp[2 * typeIndices[i][s] + 1];
 551                             if (c6 > 0 && c12 > 0)
 552                             {
 553                                 /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
 554                                 preloadSigma6[i][s] = 0.5_real * c12 / c6;
 555                                 if (preloadSigma6[i][s]
 556                                     < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */
 557                                 {
 558                                     preloadSigma6[i][s] = sigma6_min;
 559                                 }
 560                             }
 561                             else
 562                             {
 563                                 preloadSigma6[i][s] = sigma6_def;
 564                             }
 565                         }
 566                         if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 567                         {
 568                             const real c6  = nbfp[2 * typeIndices[i][s]];
 569                             const real c12 = nbfp[2 * typeIndices[i][s] + 1];
 570                             if (c6 > 0 && c12 > 0)
 571                             {
 572                                 /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
 573                                 preloadSigma6VdWGapsys[i][s] = 0.5_real * c12 / c6;
 574                             }
 575                             else
 576                             {
 577                                 preloadSigma6VdWGapsys[i][s] = sigma6VdWGapsys;
 578                             }
 579                         }
 580                     }
 581                     if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 582                     {
 583                         /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
 584                         const real c12A = nbfp[2 * typeIndices[STATE_A][s] + 1];
 585                         const real c12B = nbfp[2 * typeIndices[STATE_B][s] + 1];
 586                         if (c12A > 0 && c12B > 0)
 587                         {
 588                             preloadAlphaVdwEff[s]  = 0;
 589                             preloadAlphaCoulEff[s] = 0;
 590                         }
 591                         else
 592                         {
 593                             preloadAlphaVdwEff[s]  = alpha_vdw;
 594                             preloadAlphaCoulEff[s] = alpha_coul;
 595                         }
 596                     }
 597                     if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 598                     {
 599                         /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
 600                         const real c12A = nbfp[2 * typeIndices[STATE_A][s] + 1];
 601                         const real c12B = nbfp[2 * typeIndices[STATE_B][s] + 1];
 602                         if (c12A > 0 && c12B > 0)
 603                         {
 604                             preloadGapsysScaleLinpointVdW[s]  = 0;
 605                             preloadGapsysScaleLinpointCoul[s] = 0;
 606                         }
 607                         else
 608                         {
 609                             preloadGapsysScaleLinpointVdW[s]  = gapsysScaleLinpointVdW;
 610                             preloadGapsysScaleLinpointCoul[s] = gapsysScaleLinpointCoul;
 611                         }
 612                     }
 613                 }
 614                 else
 615                 {
 616                     preloadJnr[s]                     = jjnr[k];
 617                     preloadPairIsValid[s]             = false;
 618                     preloadPairIncluded[s]            = false;
 619                     preloadAlphaVdwEff[s]             = 0;
 620                     preloadAlphaCoulEff[s]            = 0;
 621                     preloadGapsysScaleLinpointVdW[s]  = 0;
 622                     preloadGapsysScaleLinpointCoul[s] = 0;
 623
 624                     for (int i = 0; i < NSTATES; i++)
 625                     {
 626                         typeIndices[STATE_A][s]      = ntiA + typeA[jjnr[k]];
 627                         typeIndices[STATE_B][s]      = ntiB + typeB[jjnr[k]];
 628                         preloadLjPmeC6Grid[i][s]     = 0;
 629                         preloadQq[i][s]              = 0;
 630                         preloadSigma6[i][s]          = 0;
 631                         preloadSigma6VdWGapsys[i][s] = 0;
 632                     }
 633                 }
 634             }
 635
 636             RealType jx, jy, jz;
 637             gmx::gatherLoadUTranspose<3>(reinterpret_cast<const real*>(x), preloadJnr, &jx, &jy, &jz);
 638
 639             const RealType pairIsValid   = gmx::load<RealType>(preloadPairIsValid);
 640             const RealType pairIncluded  = gmx::load<RealType>(preloadPairIncluded);
 641             const BoolType bPairIncluded = (pairIncluded != zero);
 642             const BoolType bPairExcluded = (pairIncluded == zero && pairIsValid != zero);
 643
 644             const RealType dX  = ix - jx;
 645             const RealType dY  = iy - jy;
 646             const RealType dZ  = iz - jz;
 647             RealType       rSq = dX * dX + dY * dY + dZ * dZ;
 648
 649             BoolType withinCutoffMask = (rSq < rcutoff_max2);
 650
 651             if (!gmx::anyTrue(withinCutoffMask || bPairExcluded))
 652             {
 653                 /* We save significant time by skipping all code below.
 654                  * Note that with soft-core interactions, the actual cut-off
 655                  * check might be different. But since the soft-core distance
 656                  * is always larger than r, checking on r here is safe.
 657                  * Exclusions outside the cutoff can not be skipped as
 658                  * when using Ewald: the reciprocal-space
 659                  * Ewald component still needs to be subtracted.
 660                  */
 661                 continue;
 662             }
 663             else
 664             {
 665                 havePairsWithinCutoff = true;
 666             }
 667
 668             if (gmx::anyTrue(rlistSquared < rSq && bPairExcluded))
 669             {
 670                 haveExcludedPairsBeyondRlist = true;
 671             }
 672
 673             const IntType  jnr_s    = gmx::load<IntType>(preloadJnr);
 674             const BoolType bIiEqJnr = gmx::cvtIB2B(ii_s == jnr_s);
 675
 676             RealType            c6[NSTATES];
 677             RealType            c12[NSTATES];
 678             RealType gmx_unused sigma6[NSTATES];
 679             RealType            qq[NSTATES];
 680             RealType gmx_unused ljPmeC6Grid[NSTATES];
 681             RealType gmx_unused alphaVdwEff;
 682             RealType gmx_unused alphaCoulEff;
 683             RealType gmx_unused gapsysScaleLinpointVdWEff;
 684             RealType gmx_unused gapsysScaleLinpointCoulEff;
 685             RealType gmx_unused sigmaVdWGapsysEff[NSTATES];
 686             for (int i = 0; i < NSTATES; i++)
 687             {
 688                 gmx::gatherLoadTranspose<2>(nbfp.data(), typeIndices[i], &c6[i], &c12[i]);
 689                 qq[i]          = gmx::load<RealType>(preloadQq[i]);
 690                 ljPmeC6Grid[i] = gmx::load<RealType>(preloadLjPmeC6Grid[i]);
 691                 if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 692                 {
 693                     sigma6[i] = gmx::load<RealType>(preloadSigma6[i]);
 694                 }
 695                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 696                 {
 697                     sigmaVdWGapsysEff[i] = gmx::load<RealType>(preloadSigma6VdWGapsys[i]);
 698                 }
 699             }
 700             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 701             {
 702                 alphaVdwEff  = gmx::load<RealType>(preloadAlphaVdwEff);
 703                 alphaCoulEff = gmx::load<RealType>(preloadAlphaCoulEff);
 704             }
 705             if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 706             {
 707                 gapsysScaleLinpointVdWEff  = gmx::load<RealType>(preloadGapsysScaleLinpointVdW);
 708                 gapsysScaleLinpointCoulEff = gmx::load<RealType>(preloadGapsysScaleLinpointCoul);
 709             }
 710
 711             // Avoid overflow of r^-12 at distances near zero
 712             rSq  = gmx::max(rSq, minDistanceSquared);
 713             rInv = gmx::invsqrt(rSq);
 714             r    = rSq * rInv;
 715
 716             RealType gmx_unused rp, rpm2;
 717             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 718             {
 719                 rpm2 = rSq * rSq;  /* r4 */
 720                 rp   = rpm2 * rSq; /* r6 */
 721             }
 722             else
 723             {
 724                 /* The soft-core power p will not affect the results
 725                  * with not using soft-core, so we use power of 0 which gives
 726                  * the simplest math and cheapest code.
 727                  */
 728                 rpm2 = rInv * rInv;
 729                 rp   = one;
 730             }
 731
 732             RealType fScal(0);
 733
 734             /* The following block is masked to only calculate values having bPairIncluded. If
 735              * bPairIncluded is true then withinCutoffMask must also be true. */
 736             if (gmx::anyTrue(withinCutoffMask && bPairIncluded))
 737             {
 738                 RealType gmx_unused fScalC[NSTATES], fScalV[NSTATES];
 739                 RealType            vCoul[NSTATES], vVdw[NSTATES];
 740                 for (int i = 0; i < NSTATES; i++)
 741                 {
 742                     fScalC[i] = zero;
 743                     fScalV[i] = zero;
 744                     vCoul[i]  = zero;
 745                     vVdw[i]   = zero;
 746
 747                     RealType gmx_unused rInvC, rInvV, rC, rV, rPInvC, rPInvV;
 748
 749                     /* The following block is masked to require (qq[i] != 0 || c6[i] != 0 || c12[i]
 750                      * != 0) in addition to bPairIncluded, which in turn requires withinCutoffMask. */
 751                     BoolType nonZeroState = ((qq[i] != zero || c6[i] != zero || c12[i] != zero)
 752                                              && bPairIncluded && withinCutoffMask);
 753                     if (gmx::anyTrue(nonZeroState))
 754                     {
 755                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 756                         {
 757                             RealType divisor = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
 758                             rPInvC           = gmx::inv(divisor);
 759                             sixthRoot(rPInvC, &rInvC, &rC);
 760
 761                             if constexpr (scLambdasOrAlphasDiffer)
 762                             {
 763                                 RealType divisor = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
 764                                 rPInvV           = gmx::inv(divisor);
 765                                 sixthRoot(rPInvV, &rInvV, &rV);
 766                             }
 767                             else
 768                             {
 769                                 /* We can avoid one expensive pow and one / operation */
 770                                 rPInvV = rPInvC;
 771                                 rInvV  = rInvC;
 772                                 rV     = rC;
 773                             }
 774                         }
 775                         else
 776                         {
 777                             rPInvC = one;
 778                             rInvC  = rInv;
 779                             rC     = r;
 780
 781                             rPInvV = one;
 782                             rInvV  = rInv;
 783                             rV     = r;
 784                         }
 785
 786                         /* Only process the coulomb interactions if we either
 787                          * include all entries in the list (no cutoff
 788                          * used in the kernel), or if we are within the cutoff.
 789                          */
 790                         BoolType computeElecInteraction;
 791                         if constexpr (elecInteractionTypeIsEwald)
 792                         {
 793                             computeElecInteraction = (r < rCoulomb && qq[i] != zero && bPairIncluded);
 794                         }
 795                         else
 796                         {
 797                             computeElecInteraction = (rC < rCoulomb && qq[i] != zero && bPairIncluded);
 798                         }
 799                         if (gmx::anyTrue(computeElecInteraction))
 800                         {
 801                             if constexpr (elecInteractionTypeIsEwald)
 802                             {
 803                                 vCoul[i] = ewaldPotential(qq[i], rInvC, sh_ewald);
 804                                 if constexpr (computeForces)
 805                                 {
 806                                     fScalC[i] = ewaldScalarForce(qq[i], rInvC);
 807                                 }
 808
 809                                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 810                                 {
 811                                     ewaldQuadraticPotential(qq[i],
 812                                                             facel,
 813                                                             rC,
 814                                                             rCutoffCoul,
 815                                                             LFC[i],
 816                                                             DLF[i],
 817                                                             gapsysScaleLinpointCoulEff,
 818                                                             sh_ewald,
 819                                                             &fScalC[i],
 820                                                             &vCoul[i],
 821                                                             &dvdlCoul,
 822                                                             computeElecInteraction);
 823                                 }
 824                             }
 825                             else
 826                             {
 827                                 vCoul[i] = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
 828                                 if constexpr (computeForces)
 829                                 {
 830                                     fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
 831                                 }
 832
 833                                 if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 834                                 {
 835                                     reactionFieldQuadraticPotential(qq[i],
 836                                                                     facel,
 837                                                                     rC,
 838                                                                     rCutoffCoul,
 839                                                                     LFC[i],
 840                                                                     DLF[i],
 841                                                                     gapsysScaleLinpointCoulEff,
 842                                                                     krf,
 843                                                                     crf,
 844                                                                     &fScalC[i],
 845                                                                     &vCoul[i],
 846                                                                     &dvdlCoul,
 847                                                                     computeElecInteraction);
 848                                 }
 849                             }
 850
 851                             vCoul[i] = gmx::selectByMask(vCoul[i], computeElecInteraction);
 852                             if constexpr (computeForces)
 853                             {
 854                                 fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
 855                             }
 856                         }
 857
 858                         /* Only process the VDW interactions if we either
 859                          * include all entries in the list (no cutoff used
 860                          * in the kernel), or if we are within the cutoff.
 861                          */
 862                         BoolType computeVdwInteraction;
 863                         if constexpr (vdwInteractionTypeIsEwald)
 864                         {
 865                             computeVdwInteraction =
 866                                     (r < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
 867                         }
 868                         else
 869                         {
 870                             computeVdwInteraction =
 871                                     (rV < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
 872                         }
 873                         if (gmx::anyTrue(computeVdwInteraction))
 874                         {
 875                             RealType rInv6;
 876                             if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 877                             {
 878                                 rInv6 = rPInvV;
 879                             }
 880                             else
 881                             {
 882                                 rInv6 = calculateRinv6(rInvV);
 883                             }
 884                             // Avoid overflow at short distance for masked exclusions and
 885                             // for foreign energy calculations at a hard core end state.
 886                             // Note that we should limit r^-6, and thus also r^-12, and
 887                             // not only r^-12, as that could lead to erroneously low instead
 888                             // of very high foreign energies.
 889                             rInv6           = gmx::min(rInv6, maxRInvSix);
 890                             RealType vVdw6  = calculateVdw6(c6[i], rInv6);
 891                             RealType vVdw12 = calculateVdw12(c12[i], rInv6);
 892
 893                             vVdw[i] = lennardJonesPotential(
 894                                     vVdw6, vVdw12, c6[i], c12[i], repulsionShift, dispersionShift, oneSixth, oneTwelfth);
 895                             if constexpr (computeForces)
 896                             {
 897                                 fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
 898                             }
 899
 900                             if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
 901                             {
 902                                 lennardJonesQuadraticPotential(c6[i],
 903                                                                c12[i],
 904                                                                r,
 905                                                                rSq,
 906                                                                LFV[i],
 907                                                                DLF[i],
 908                                                                sigmaVdWGapsysEff[i],
 909                                                                gapsysScaleLinpointVdWEff,
 910                                                                repulsionShift,
 911                                                                dispersionShift,
 912                                                                &fScalV[i],
 913                                                                &vVdw[i],
 914                                                                &dvdlVdw,
 915                                                                computeVdwInteraction);
 916                             }
 917
 918                             if constexpr (vdwInteractionTypeIsEwald)
 919                             {
 920                                 /* Subtract the grid potential at the cut-off */
 921                                 vVdw[i] = vVdw[i]
 922                                           + gmx::selectByMask(ewaldLennardJonesGridSubtract(
 923                                                                       ljPmeC6Grid[i], shLjEwald, oneSixth),
 924                                                               computeVdwInteraction);
 925                             }
 926
 927                             if constexpr (vdwModifierIsPotSwitch)
 928                             {
 929                                 RealType d             = rV - rVdwSwitch;
 930                                 BoolType zeroMask      = zero < d;
 931                                 BoolType potSwitchMask = rV < rVdw;
 932                                 d                      = gmx::selectByMask(d, zeroMask);
 933                                 const RealType d2      = d * d;
 934                                 const RealType sw =
 935                                         one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
 936
 937                                 if constexpr (computeForces)
 938                                 {
 939                                     const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
 940                                     fScalV[i]          = potSwitchScalarForceMod(
 941                                             fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
 942                                 }
 943                                 vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, potSwitchMask);
 944                             }
 945
 946                             vVdw[i] = gmx::selectByMask(vVdw[i], computeVdwInteraction);
 947                             if constexpr (computeForces)
 948                             {
 949                                 fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
 950                             }
 951                         }
 952
 953                         if constexpr (computeForces)
 954                         {
 955                             /* fScalC (and fScalV) now contain: dV/drC * rC
 956                              * Now we multiply by rC^-6, so it will be: dV/drC * rC^-5
 957                              * Further down we first multiply by r^4 and then by
 958                              * the vector r, which in total gives: dV/drC * (r/rC)^-5
 959                              */
 960                             fScalC[i] = fScalC[i] * rPInvC;
 961                             fScalV[i] = fScalV[i] * rPInvV;
 962                         }
 963                     } // end of block requiring nonZeroState
 964                 }     // end for (int i = 0; i < NSTATES; i++)
 965
 966                 /* Assemble A and B states. */
 967                 BoolType assembleStates = (bPairIncluded && withinCutoffMask);
 968                 if (gmx::anyTrue(assembleStates))
 969                 {
 970                     for (int i = 0; i < NSTATES; i++)
 971                     {
 972                         vCTot = vCTot + LFC[i] * vCoul[i];
 973                         vVTot = vVTot + LFV[i] * vVdw[i];
 974
 975                         if constexpr (computeForces)
 976                         {
 977                             fScal = fScal + LFC[i] * fScalC[i] * rpm2;
 978                             fScal = fScal + LFV[i] * fScalV[i] * rpm2;
 979                         }
 980
 981                         if constexpr (softcoreType == KernelSoftcoreType::Beutler)
 982                         {
 983                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i]
 984                                        + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i];
 985                             dvdlVdw = dvdlVdw + vVdw[i] * DLF[i]
 986                                       + LFV[i] * alphaVdwEff * dlFacVdw[i] * fScalV[i] * sigma6[i];
 987                         }
 988                         else
 989                         {
 990                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i];
 991                             dvdlVdw  = dvdlVdw + vVdw[i] * DLF[i];
 992                         }
 993                     }
 994                 }
 995             } // end of block requiring bPairIncluded && withinCutoffMask
 996             /* In the following block bPairIncluded should be false in the masks. */
 997             if (icoul == NbkernelElecType::ReactionField)
 998             {
 999                 const BoolType computeReactionField = bPairExcluded;
1000
1001                 if (gmx::anyTrue(computeReactionField))
1002                 {
1003                     /* For excluded pairs we don't use soft-core.
1004                      * As there is no singularity, there is no need for soft-core.
1005                      */
1006                     const RealType FF = -two * krf;
1007                     RealType       VV = krf * rSq - crf;
1008
1009                     /* If ii == jnr the i particle (ii) has itself (jnr)
1010                      * in its neighborlist. This corresponds to a self-interaction
1011                      * that will occur twice. Scale it down by 50% to only include
1012                      * it once.
1013                      */
1014                     VV = VV * gmx::blend(one, half, bIiEqJnr);
1015
1016                     for (int i = 0; i < NSTATES; i++)
1017                     {
1018                         vCTot = vCTot + gmx::selectByMask(LFC[i] * qq[i] * VV, computeReactionField);
1019                         fScal = fScal + gmx::selectByMask(LFC[i] * qq[i] * FF, computeReactionField);
1020                         dvdlCoul = dvdlCoul + gmx::selectByMask(DLF[i] * qq[i] * VV, computeReactionField);
1021                     }
1022                 }
1023             }
1024
1025             const BoolType computeElecEwaldInteraction = (bPairExcluded || r < rCoulomb);
1026             if (elecInteractionTypeIsEwald && gmx::anyTrue(computeElecEwaldInteraction))
1027             {
1028                 /* See comment in the preamble. When using Ewald interactions
1029                  * (unless we use a switch modifier) we subtract the reciprocal-space
1030                  * Ewald component here which made it possible to apply the free
1031                  * energy interaction to 1/r (vanilla coulomb short-range part)
1032                  * above. This gets us closer to the ideal case of applying
1033                  * the softcore to the entire electrostatic interaction,
1034                  * including the reciprocal-space component.
1035                  */
1036                 RealType v_lr, f_lr;
1037
1038                 pmeCoulombCorrectionVF<computeForces>(rSq, ewaldBeta, &v_lr, &f_lr);
1039                 if constexpr (computeForces)
1040                 {
1041                     f_lr = f_lr * rInv * rInv;
1042                 }
1043
1044                 /* Note that any possible Ewald shift has already been applied in
1045                  * the normal interaction part above.
1046                  */
1047
1048                 /* If ii == jnr the i particle (ii) has itself (jnr)
1049                  * in its neighborlist. This corresponds to a self-interaction
1050                  * that will occur twice. Scale it down by 50% to only include
1051                  * it once.
1052                  */
1053                 v_lr = v_lr * gmx::blend(one, half, bIiEqJnr);
1054
1055                 for (int i = 0; i < NSTATES; i++)
1056                 {
1057                     vCTot = vCTot - gmx::selectByMask(LFC[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1058                     if constexpr (computeForces)
1059                     {
1060                         fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
1061                     }
1062                     dvdlCoul = dvdlCoul
1063                                - gmx::selectByMask(DLF[i] * qq[i] * v_lr, computeElecEwaldInteraction);
1064                 }
1065             }
1066
1067             const BoolType computeVdwEwaldInteraction = (bPairExcluded || r < rVdw);
1068             if (vdwInteractionTypeIsEwald && gmx::anyTrue(computeVdwEwaldInteraction))
1069             {
1070                 /* See comment in the preamble. When using LJ-Ewald interactions
1071                  * (unless we use a switch modifier) we subtract the reciprocal-space
1072                  * Ewald component here which made it possible to apply the free
1073                  * energy interaction to r^-6 (vanilla LJ6 short-range part)
1074                  * above. This gets us closer to the ideal case of applying
1075                  * the softcore to the entire VdW interaction,
1076                  * including the reciprocal-space component.
1077                  */
1078
1079                 RealType v_lr, f_lr;
1080                 pmeLJCorrectionVF<computeForces>(
1081                         rInv, rSq, ewaldLJCoeffSq, ewaldLJCoeffSixDivSix, &v_lr, &f_lr, computeVdwEwaldInteraction, bIiEqJnr);
1082                 v_lr = v_lr * oneSixth;
1083
1084                 for (int i = 0; i < NSTATES; i++)
1085                 {
1086                     vVTot = vVTot + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1087                     if constexpr (computeForces)
1088                     {
1089                         fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
1090                     }
1091                     dvdlVdw = dvdlVdw + gmx::selectByMask(DLF[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
1092                 }
1093             }
1094
1095             if (computeForces && gmx::anyTrue(fScal != zero))
1096             {
1097                 const RealType tX = fScal * dX;
1098                 const RealType tY = fScal * dY;
1099                 const RealType tZ = fScal * dZ;
1100                 fIX               = fIX + tX;
1101                 fIY               = fIY + tY;
1102                 fIZ               = fIZ + tZ;
1103
1104                 gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ);
1105             }
1106         } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
1107
1108         if (havePairsWithinCutoff)
1109         {
1110             if constexpr (computeForces)
1111             {
1112                 gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ);
1113
1114                 if (doShiftForces)
1115                 {
1116                     gmx::transposeScatterIncrU<3>(
1117                             reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
1118                 }
1119             }
1120             if (doPotential)
1121             {
1122                 int ggid = gid[n];
1123                 threadVc[ggid] += gmx::reduce(vCTot);
1124                 threadVv[ggid] += gmx::reduce(vVTot);
1125             }
1126         }
1127     } // end for (int n = 0; n < nri; n++)
1128
1129     if (gmx::anyTrue(dvdlCoul != zero))
1130     {
1131         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)] += gmx::reduce(dvdlCoul);
1132     }
1133     if (gmx::anyTrue(dvdlVdw != zero))
1134     {
1135         threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)] += gmx::reduce(dvdlVdw);
1136     }
1137
1138     /* Estimate flops, average for free energy stuff:
1139      * 12  flops per outer iteration
1140      * 150 flops per inner iteration
1141      * TODO: Update the number of flops and/or use different counts for different code paths.
1142      */
1143     atomicNrnbIncrement(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist.nri * 12 + nlist.jindex[nri] * 150);
1144
1145     if (haveExcludedPairsBeyondRlist > 0)
1146     {
1147         gmx_fatal(FARGS,
1148                   "There are perturbed non-bonded pair interactions beyond the pair-list cutoff "
1149                   "of %g nm, which is not supported. This can happen because the system is "
1150                   "unstable or because intra-molecular interactions at long distances are "
1151                   "excluded. If the "
1152                   "latter is the case, you can try to increase nstlist or rlist to avoid this."
1153                   "The error is likely triggered by the use of couple-intramol=no "
1154                   "and the maximal distance in the decoupled molecule exceeding rlist.",
1155                   rlist);
1156     }
1157 }
1158
1159 typedef void (*KernelFunction)(const t_nblist&                                  nlist,
1160                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1161                                const int                                        ntype,
1162                                const real                                       rlist,
1163                                const interaction_const_t&                       ic,
1164                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1165                                gmx::ArrayRef<const real>                        nbfp,
1166                                gmx::ArrayRef<const real>                        nbfp_grid,
1167                                gmx::ArrayRef<const real>                        chargeA,
1168                                gmx::ArrayRef<const real>                        chargeB,
1169                                gmx::ArrayRef<const int>                         typeA,
1170                                gmx::ArrayRef<const int>                         typeB,
1171                                int                                              flags,
1172                                gmx::ArrayRef<const real>                        lambda,
1173                                t_nrnb* gmx_restrict                             nrnb,
1174                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1175                                rvec*               threadForceShiftBuffer,
1176                                gmx::ArrayRef<real> threadVc,
1177                                gmx::ArrayRef<real> threadVv,
1178                                gmx::ArrayRef<real> threadDvdl);
1179
1180 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
1181 static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
1182 {
1183     if (useSimd)
1184     {
1185 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS
1186         return (nb_free_energy_kernel<SimdDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1187 #else
1188         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch computeForces>);
1189 #endif
1190     }
1191     else
1192     {
1193         return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
1194     }
1195 }
1196
1197 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
1198 static KernelFunction dispatchKernelOnComputeForces(const bool computeForces, const bool useSimd)
1199 {
1200     if (computeForces)
1201     {
1202         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, true>(
1203                 useSimd));
1204     }
1205     else
1206     {
1207         return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, false>(
1208                 useSimd));
1209     }
1210 }
1211
1212 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
1213 static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch,
1214                                                   const bool computeForces,
1215                                                   const bool useSimd)
1216 {
1217     if (vdwModifierIsPotSwitch)
1218     {
1219         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
1220                 computeForces, useSimd));
1221     }
1222     else
1223     {
1224         return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
1225                 computeForces, useSimd));
1226     }
1227 }
1228
1229 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
1230 static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
1231                                                           const bool vdwModifierIsPotSwitch,
1232                                                           const bool computeForces,
1233                                                           const bool useSimd)
1234 {
1235     if (elecInteractionTypeIsEwald)
1236     {
1237         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
1238                 vdwModifierIsPotSwitch, computeForces, useSimd));
1239     }
1240     else
1241     {
1242         return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
1243                 vdwModifierIsPotSwitch, computeForces, useSimd));
1244     }
1245 }
1246
1247 template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer>
1248 static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
1249                                                          const bool elecInteractionTypeIsEwald,
1250                                                          const bool vdwModifierIsPotSwitch,
1251                                                          const bool computeForces,
1252                                                          const bool useSimd)
1253 {
1254     if (vdwInteractionTypeIsEwald)
1255     {
1256         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, true>(
1257                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1258     }
1259     else
1260     {
1261         return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, false>(
1262                 elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1263     }
1264 }
1265
1266 template<KernelSoftcoreType softcoreType>
1267 static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
1268                                                                   const bool vdwInteractionTypeIsEwald,
1269                                                                   const bool elecInteractionTypeIsEwald,
1270                                                                   const bool vdwModifierIsPotSwitch,
1271                                                                   const bool computeForces,
1272                                                                   const bool useSimd)
1273 {
1274     if (scLambdasOrAlphasDiffer)
1275     {
1276         return (dispatchKernelOnVdwInteractionType<softcoreType, true>(
1277                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1278     }
1279     else
1280     {
1281         return (dispatchKernelOnVdwInteractionType<softcoreType, false>(
1282                 vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
1283     }
1284 }
1285
1286 static KernelFunction dispatchKernel(const bool                 scLambdasOrAlphasDiffer,
1287                                      const bool                 vdwInteractionTypeIsEwald,
1288                                      const bool                 elecInteractionTypeIsEwald,
1289                                      const bool                 vdwModifierIsPotSwitch,
1290                                      const bool                 computeForces,
1291                                      const bool                 useSimd,
1292                                      const interaction_const_t& ic)
1293 {
1294     const auto& scParams = *ic.softCoreParameters;
1295     if (scParams.softcoreType == SoftcoreType::Beutler)
1296     {
1297         if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
1298         {
1299             return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
1300                     scLambdasOrAlphasDiffer,
1301                     vdwInteractionTypeIsEwald,
1302                     elecInteractionTypeIsEwald,
1303                     vdwModifierIsPotSwitch,
1304                     computeForces,
1305                     useSimd));
1306         }
1307         return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Beutler>(
1308                 scLambdasOrAlphasDiffer,
1309                 vdwInteractionTypeIsEwald,
1310                 elecInteractionTypeIsEwald,
1311                 vdwModifierIsPotSwitch,
1312                 computeForces,
1313                 useSimd));
1314     }
1315     else // Gapsys
1316     {
1317         if (scParams.gapsysScaleLinpointCoul == 0 && scParams.gapsysScaleLinpointVdW == 0)
1318         {
1319             return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
1320                     scLambdasOrAlphasDiffer,
1321                     vdwInteractionTypeIsEwald,
1322                     elecInteractionTypeIsEwald,
1323                     vdwModifierIsPotSwitch,
1324                     computeForces,
1325                     useSimd));
1326         }
1327         return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Gapsys>(
1328                 scLambdasOrAlphasDiffer,
1329                 vdwInteractionTypeIsEwald,
1330                 elecInteractionTypeIsEwald,
1331                 vdwModifierIsPotSwitch,
1332                 computeForces,
1333                 useSimd));
1334     }
1335 }
1336
1337
1338 void gmx_nb_free_energy_kernel(const t_nblist&                                  nlist,
1339                                const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
1340                                const bool                                       useSimd,
1341                                const int                                        ntype,
1342                                const real                                       rlist,
1343                                const interaction_const_t&                       ic,
1344                                gmx::ArrayRef<const gmx::RVec>                   shiftvec,
1345                                gmx::ArrayRef<const real>                        nbfp,
1346                                gmx::ArrayRef<const real>                        nbfp_grid,
1347                                gmx::ArrayRef<const real>                        chargeA,
1348                                gmx::ArrayRef<const real>                        chargeB,
1349                                gmx::ArrayRef<const int>                         typeA,
1350                                gmx::ArrayRef<const int>                         typeB,
1351                                int                                              flags,
1352                                gmx::ArrayRef<const real>                        lambda,
1353                                t_nrnb*                                          nrnb,
1354                                gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
1355                                rvec*               threadForceShiftBuffer,
1356                                gmx::ArrayRef<real> threadVc,
1357                                gmx::ArrayRef<real> threadVv,
1358                                gmx::ArrayRef<real> threadDvdl)
1359 {
1360     GMX_ASSERT(EEL_PME_EWALD(ic.eeltype) || ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype),
1361                "Unsupported eeltype with free energy");
1362     GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters");
1363
1364     // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert
1365     GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty()
1366                        || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(),
1367                "We need actual padding with at least one element for SIMD scatter operations");
1368
1369     const auto& scParams                   = *ic.softCoreParameters;
1370     const bool  vdwInteractionTypeIsEwald  = (EVDW_PME(ic.vdwtype));
1371     const bool  elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype));
1372     const bool  vdwModifierIsPotSwitch     = (ic.vdw_modifier == InteractionModifiers::PotSwitch);
1373     const bool  computeForces              = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
1374     bool        scLambdasOrAlphasDiffer    = true;
1375
1376     if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
1377     {
1378         scLambdasOrAlphasDiffer = false;
1379     }
1380     else
1381     {
1382         if (lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
1383                     == lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]
1384             && scParams.alphaCoulomb == scParams.alphaVdw)
1385         {
1386             scLambdasOrAlphasDiffer = false;
1387         }
1388     }
1389
1390     KernelFunction kernelFunc;
1391     kernelFunc = dispatchKernel(scLambdasOrAlphasDiffer,
1392                                 vdwInteractionTypeIsEwald,
1393                                 elecInteractionTypeIsEwald,
1394                                 vdwModifierIsPotSwitch,
1395                                 computeForces,
1396                                 useSimd,
1397                                 ic);
1398     kernelFunc(nlist,
1399                coords,
1400                ntype,
1401                rlist,
1402                ic,
1403                shiftvec,
1404                nbfp,
1405                nbfp_grid,
1406                chargeA,
1407                chargeB,
1408                typeA,
1409                typeB,
1410                flags,
1411                lambda,
1412                nrnb,
1413                threadForceBuffer,
1414                threadForceShiftBuffer,
1415                threadVc,
1416                threadVv,
1417                threadDvdl);
1418 }