Avoid comparing Simd types to real 0 when making masks in NB FEP kernel.

[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_free_energy.cpp
diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp b/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp

index ad316bd5da5020a31631387cbb416f019b398993..85f998cfc546412a2fafffbac2623f846656fe4a 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp
+++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp
@@ -63,6 +63,7 @@
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/arrayref.h"
  
+#include "nb_softcore.h"
  
  //! Scalar (non-SIMD) data types.
  struct ScalarDataTypes
@@ -90,46 +91,71 @@ struct SimdDataTypes
  };
  #endif
  
-template<class RealType, class BoolType>
+/*! \brief Lower limit for square interaction distances in nonbonded kernels.
+ *
+ * This is a mimimum on r^2 to avoid overflows when computing r^6.
+ * This will only affect results for soft-cored interaction at distances smaller
+ * than 1e-6 and will limit extremely high foreign energies for overlapping atoms.
+ * Note that we could use a somewhat smaller minimum in double precision.
+ * But because invsqrt in double precision can use single precision, this number
+ * can not be much smaller, we use the same number for simplicity.
+ */
+constexpr real c_minDistanceSquared = 1.0e-12_real;
+
+/*! \brief Higher limit for r^-6 used for Lennard-Jones interactions
+ *
+ * This is needed to avoid overflow of LJ energy and force terms for excluded
+ * atoms and foreign energies of hard-core states of overlapping atoms.
+ * Note that in single precision this value leaves room for C12 coefficients up to 3.4e8.
+ */
+constexpr real c_maxRInvSix = 1.0e15_real;
+
+template<bool computeForces, class RealType>
  static inline void
-pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType* force, const BoolType mask)
+pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType gmx_unused* force)
  {
-    const RealType brsq = gmx::selectByMask(rSq * beta * beta, mask);
-    *force              = -brsq * beta * gmx::pmeForceCorrection(brsq);
-    *pot                = beta * gmx::pmePotentialCorrection(brsq);
+    const RealType brsq = rSq * beta * beta;
+    if constexpr (computeForces)
+    {
+        *force = -brsq * beta * gmx::pmeForceCorrection(brsq);
+    }
+    *pot = beta * gmx::pmePotentialCorrection(brsq);
  }
  
-template<class RealType, class BoolType>
+template<bool computeForces, class RealType, class BoolType>
  static inline void pmeLJCorrectionVF(const RealType rInv,
                                       const RealType rSq,
                                       const real     ewaldLJCoeffSq,
                                       const real     ewaldLJCoeffSixDivSix,
                                       RealType*      pot,
-                                     RealType*      force,
-                                     const BoolType mask,
-                                     const BoolType bIiEqJnr)
+                                     RealType gmx_unused* force,
+                                     const BoolType       mask,
+                                     const BoolType       bIiEqJnr)
  {
      // We mask rInv to get zero force and potential for masked out pair interactions
-    const RealType rInvSq  = gmx::selectByMask(rInv * rInv, mask);
+    const RealType rInvSq  = rInv * rInv;
      const RealType rInvSix = rInvSq * rInvSq * rInvSq;
      // Mask rSq to avoid underflow in exp()
      const RealType coeffSqRSq       = ewaldLJCoeffSq * gmx::selectByMask(rSq, mask);
      const RealType expNegCoeffSqRSq = gmx::exp(-coeffSqRSq);
      const RealType poly             = 1.0_real + coeffSqRSq + 0.5_real * coeffSqRSq * coeffSqRSq;
-    *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
-    *force = *force * rInvSq;
+    if constexpr (computeForces)
+    {
+        *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
+        *force = *force * rInvSq;
+    }
      // The self interaction is the limit for r -> 0 which we need to compute separately
      *pot = gmx::blend(
              rInvSix * (1.0_real - expNegCoeffSqRSq * poly), 0.5_real * ewaldLJCoeffSixDivSix, bIiEqJnr);
  }
  
-//! Computes r^(1/p) and 1/r^(1/p) for the standard p=6
-template<class RealType, class BoolType>
-static inline void pthRoot(const RealType r, RealType* pthRoot, RealType* invPthRoot, const BoolType mask)
+//! Computes r^(1/6) and 1/r^(1/6)
+template<class RealType>
+static inline void sixthRoot(const RealType r, RealType* sixthRoot, RealType* invSixthRoot)
  {
      RealType cbrtRes = gmx::cbrt(r);
-    *invPthRoot      = gmx::maskzInvsqrt(cbrtRes, mask);
-    *pthRoot         = gmx::maskzInv(*invPthRoot, mask);
+    *invSixthRoot    = gmx::invsqrt(cbrtRes);
+    *sixthRoot       = gmx::inv(*invSixthRoot);
  }
  
  template<class RealType>
@@ -232,7 +258,7 @@ static inline RealType potSwitchPotentialMod(const RealType potentialInp, const
  
  
  //! Templated free-energy non-bonded kernel
-template<typename DataTypes, bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
+template<typename DataTypes, KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
  static void nb_free_energy_kernel(const t_nblist&                                  nlist,
                                    const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
                                    const int                                        ntype,
@@ -248,8 +274,8 @@ static void nb_free_energy_kernel(const t_nblist&
                                    int                                              flags,
                                    gmx::ArrayRef<const real>                        lambda,
                                    t_nrnb* gmx_restrict                             nrnb,
-                                  gmx::RVec*          threadForceBuffer,
-                                  rvec*               threadForceShiftBuffer,
+                                  gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
+                                  rvec gmx_unused*    threadForceShiftBuffer,
                                    gmx::ArrayRef<real> threadVc,
                                    gmx::ArrayRef<real> threadVv,
                                    gmx::ArrayRef<real> threadDvdl)
@@ -278,16 +304,22 @@ static void nb_free_energy_kernel(const t_nblist&
      gmx::ArrayRef<const int> shift  = nlist.shift;
      gmx::ArrayRef<const int> gid    = nlist.gid;
  
-    const real  lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
-    const real  lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
-    const auto& scParams    = *ic.softCoreParameters;
-    const real gmx_unused alpha_coul    = scParams.alphaCoulomb;
-    const real gmx_unused alpha_vdw     = scParams.alphaVdw;
-    const real            lam_power     = scParams.lambdaPower;
-    const real gmx_unused sigma6_def    = scParams.sigma6WithInvalidSigma;
-    const real gmx_unused sigma6_min    = scParams.sigma6Minimum;
-    const bool            doForces      = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
-    const bool            doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
+    const real lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
+    const real lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
+
+    // Extract softcore parameters
+    const auto&           scParams   = *ic.softCoreParameters;
+    const real            lam_power  = scParams.lambdaPower;
+    const real gmx_unused alpha_coul = scParams.alphaCoulomb;
+    const real gmx_unused alpha_vdw  = scParams.alphaVdw;
+    const real gmx_unused sigma6_def = scParams.sigma6WithInvalidSigma;
+    const real gmx_unused sigma6_min = scParams.sigma6Minimum;
+
+    const real gmx_unused gapsysScaleLinpointCoul = scParams.gapsysScaleLinpointCoul;
+    const real gmx_unused gapsysScaleLinpointVdW  = scParams.gapsysScaleLinpointVdW;
+    const real gmx_unused gapsysSigma6VdW         = scParams.gapsysSigma6VdW;
+
+    const bool gmx_unused doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
      const bool            doPotential   = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0);
  
      // Extract data from interaction_const_t
@@ -340,8 +372,9 @@ static void nb_free_energy_kernel(const t_nblist&
          icoul = NbkernelElecType::None;
      }
  
-    real rcutoff_max2 = std::max(ic.rcoulomb, ic.rvdw);
-    rcutoff_max2      = rcutoff_max2 * rcutoff_max2;
+    real rcutoff_max2                 = std::max(ic.rcoulomb, ic.rvdw);
+    rcutoff_max2                      = rcutoff_max2 * rcutoff_max2;
+    const real gmx_unused rCutoffCoul = ic.rcoulomb;
  
      real gmx_unused sh_ewald = 0;
      if constexpr (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
@@ -364,8 +397,12 @@ static void nb_free_energy_kernel(const t_nblist&
      GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
                         "Can not apply soft-core to switched Ewald potentials");
  
-    RealType dvdlCoul(zero);
-    RealType dvdlVdw(zero);
+    const RealType            minDistanceSquared(c_minDistanceSquared);
+    const RealType            maxRInvSix(c_maxRInvSix);
+    const RealType gmx_unused floatMin(GMX_FLOAT_MIN);
+
+    RealType dvdlCoul{ zero };
+    RealType dvdlVdw{ zero };
  
      /* Lambda factor for state A, 1-lambda*/
      real LFC[NSTATES], LFV[NSTATES];
@@ -391,8 +428,20 @@ static void nb_free_energy_kernel(const t_nblist&
          dlFacVdw[i]  = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
      }
  
-    // TODO: We should get rid of using pointers to real
+    // We need pointers to real for SIMD access
      const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
+    real* gmx_restrict       forceRealPtr;
+    if constexpr (computeForces)
+    {
+        GMX_ASSERT(nri == 0 || !threadForceBuffer.empty(), "need a valid threadForceBuffer");
+
+        forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0];
+
+        if (doShiftForces)
+        {
+            GMX_ASSERT(threadForceShiftBuffer != nullptr, "need a valid threadForceShiftBuffer");
+        }
+    }
  
      const real rlistSquared = gmx::square(rlist);
  
@@ -424,16 +473,16 @@ static void nb_free_energy_kernel(const t_nblist&
          RealType   fIZ(0);
  
  #if GMX_SIMD_HAVE_REAL
-        alignas(GMX_SIMD_ALIGNMENT) int preloadIi[DataTypes::simdRealWidth];
-        alignas(GMX_SIMD_ALIGNMENT) int preloadIs[DataTypes::simdRealWidth];
+        alignas(GMX_SIMD_ALIGNMENT) int            preloadIi[DataTypes::simdRealWidth];
+        alignas(GMX_SIMD_ALIGNMENT) int gmx_unused preloadIs[DataTypes::simdRealWidth];
  #else
-        int preloadIi[DataTypes::simdRealWidth];
-        int preloadIs[DataTypes::simdRealWidth];
+        int            preloadIi[DataTypes::simdRealWidth];
+        int gmx_unused preloadIs[DataTypes::simdRealWidth];
  #endif
-        for (int s = 0; s < DataTypes::simdRealWidth; s++)
+        for (int i = 0; i < DataTypes::simdRealWidth; i++)
          {
-            preloadIi[s] = ii;
-            preloadIs[s] = shift[n];
+            preloadIi[i] = ii;
+            preloadIs[i] = shift[n];
          }
          IntType ii_s = gmx::load<IntType>(preloadIi);
  
@@ -450,6 +499,12 @@ static void nb_free_energy_kernel(const t_nblist&
              alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
              alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
              alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT)
+                    real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT)
+                    real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT)
+                    real gmx_unused preloadGapsysSigma6VdW[NSTATES][DataTypes::simdRealWidth];
              alignas(GMX_SIMD_ALIGNMENT) real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
  #else
              real            preloadPairIsValid[DataTypes::simdRealWidth];
@@ -460,84 +515,120 @@ static void nb_free_energy_kernel(const t_nblist&
              real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
              real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
              real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
+            real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
+            real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
+            real gmx_unused preloadGapsysSigma6VdW[NSTATES][DataTypes::simdRealWidth];
              real            preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
  #endif
-            for (int s = 0; s < DataTypes::simdRealWidth; s++)
+            for (int j = 0; j < DataTypes::simdRealWidth; j++)
              {
-                if (k + s < nj1)
+                if (k + j < nj1)
                  {
-                    preloadPairIsValid[s] = true;
+                    preloadPairIsValid[j] = true;
                      /* Check if this pair on the exclusions list.*/
-                    preloadPairIncluded[s]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + s]);
-                    const int jnr           = jjnr[k + s];
-                    preloadJnr[s]           = jnr;
-                    typeIndices[STATE_A][s] = ntiA + typeA[jnr];
-                    typeIndices[STATE_B][s] = ntiB + typeB[jnr];
-                    preloadQq[STATE_A][s]   = iqA * chargeA[jnr];
-                    preloadQq[STATE_B][s]   = iqB * chargeB[jnr];
+                    preloadPairIncluded[j]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + j]);
+                    const int jnr           = jjnr[k + j];
+                    preloadJnr[j]           = jnr;
+                    typeIndices[STATE_A][j] = ntiA + typeA[jnr];
+                    typeIndices[STATE_B][j] = ntiB + typeB[jnr];
+                    preloadQq[STATE_A][j]   = iqA * chargeA[jnr];
+                    preloadQq[STATE_B][j]   = iqB * chargeB[jnr];
  
                      for (int i = 0; i < NSTATES; i++)
                      {
                          if constexpr (vdwInteractionTypeIsEwald)
                          {
-                            preloadLjPmeC6Grid[i][s] = nbfp_grid[2 * typeIndices[i][s]];
+                            preloadLjPmeC6Grid[i][j] = nbfp_grid[2 * typeIndices[i][j]];
                          }
                          else
                          {
-                            preloadLjPmeC6Grid[i][s] = 0;
+                            preloadLjPmeC6Grid[i][j] = 0;
                          }
-                        if constexpr (useSoftCore)
+                        if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                          {
-                            const real c6  = nbfp[2 * typeIndices[i][s]];
-                            const real c12 = nbfp[2 * typeIndices[i][s] + 1];
+                            const real c6  = nbfp[2 * typeIndices[i][j]];
+                            const real c12 = nbfp[2 * typeIndices[i][j] + 1];
                              if (c6 > 0 && c12 > 0)
                              {
                                  /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
-                                preloadSigma6[i][s] = 0.5_real * c12 / c6;
-                                if (preloadSigma6[i][s]
+                                preloadSigma6[i][j] = 0.5_real * c12 / c6;
+                                if (preloadSigma6[i][j]
                                      < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */
                                  {
-                                    preloadSigma6[i][s] = sigma6_min;
+                                    preloadSigma6[i][j] = sigma6_min;
                                  }
                              }
                              else
                              {
-                                preloadSigma6[i][s] = sigma6_def;
+                                preloadSigma6[i][j] = sigma6_def;
+                            }
+                        }
+                        if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                        {
+                            const real c6  = nbfp[2 * typeIndices[i][j]];
+                            const real c12 = nbfp[2 * typeIndices[i][j] + 1];
+                            if (c6 > 0 && c12 > 0)
+                            {
+                                /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
+                                preloadGapsysSigma6VdW[i][j] = 0.5_real * c12 / c6;
+                            }
+                            else
+                            {
+                                preloadGapsysSigma6VdW[i][j] = gapsysSigma6VdW;
                              }
                          }
                      }
-                    if constexpr (useSoftCore)
+                    if constexpr (softcoreType == KernelSoftcoreType::Beutler)
+                    {
+                        /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
+                        const real c12A = nbfp[2 * typeIndices[STATE_A][j] + 1];
+                        const real c12B = nbfp[2 * typeIndices[STATE_B][j] + 1];
+                        if (c12A > 0 && c12B > 0)
+                        {
+                            preloadAlphaVdwEff[j]  = 0;
+                            preloadAlphaCoulEff[j] = 0;
+                        }
+                        else
+                        {
+                            preloadAlphaVdwEff[j]  = alpha_vdw;
+                            preloadAlphaCoulEff[j] = alpha_coul;
+                        }
+                    }
+                    if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
                      {
                          /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
-                        const real c12A = nbfp[2 * typeIndices[STATE_A][s] + 1];
-                        const real c12B = nbfp[2 * typeIndices[STATE_B][s] + 1];
+                        const real c12A = nbfp[2 * typeIndices[STATE_A][j] + 1];
+                        const real c12B = nbfp[2 * typeIndices[STATE_B][j] + 1];
                          if (c12A > 0 && c12B > 0)
                          {
-                            preloadAlphaVdwEff[s]  = 0;
-                            preloadAlphaCoulEff[s] = 0;
+                            preloadGapsysScaleLinpointVdW[j]  = 0;
+                            preloadGapsysScaleLinpointCoul[j] = 0;
                          }
                          else
                          {
-                            preloadAlphaVdwEff[s]  = alpha_vdw;
-                            preloadAlphaCoulEff[s] = alpha_coul;
+                            preloadGapsysScaleLinpointVdW[j]  = gapsysScaleLinpointVdW;
+                            preloadGapsysScaleLinpointCoul[j] = gapsysScaleLinpointCoul;
                          }
                      }
                  }
                  else
                  {
-                    preloadJnr[s]          = jjnr[k];
-                    preloadPairIsValid[s]  = false;
-                    preloadPairIncluded[s] = false;
-                    preloadAlphaVdwEff[s]  = 0;
-                    preloadAlphaCoulEff[s] = 0;
-
+                    preloadJnr[j]                     = jjnr[k];
+                    preloadPairIsValid[j]             = false;
+                    preloadPairIncluded[j]            = false;
+                    preloadAlphaVdwEff[j]             = 0;
+                    preloadAlphaCoulEff[j]            = 0;
+                    preloadGapsysScaleLinpointVdW[j]  = 0;
+                    preloadGapsysScaleLinpointCoul[j] = 0;
+
+                    typeIndices[STATE_A][j] = ntiA + typeA[jjnr[k]];
+                    typeIndices[STATE_B][j] = ntiB + typeB[jjnr[k]];
                      for (int i = 0; i < NSTATES; i++)
                      {
-                        typeIndices[STATE_A][s]  = ntiA + typeA[jjnr[k]];
-                        typeIndices[STATE_B][s]  = ntiB + typeB[jjnr[k]];
-                        preloadLjPmeC6Grid[i][s] = 0;
-                        preloadQq[i][s]          = 0;
-                        preloadSigma6[i][s]      = 0;
+                        preloadLjPmeC6Grid[i][j]     = 0;
+                        preloadQq[i][j]              = 0;
+                        preloadSigma6[i][j]          = 0;
+                        preloadGapsysSigma6VdW[i][j] = 0;
                      }
                  }
              }
@@ -553,7 +644,7 @@ static void nb_free_energy_kernel(const t_nblist&
              const RealType dX  = ix - jx;
              const RealType dY  = iy - jy;
              const RealType dZ  = iz - jz;
-            const RealType rSq = dX * dX + dY * dY + dZ * dZ;
+            RealType       rSq = dX * dX + dY * dY + dZ * dZ;
  
              BoolType withinCutoffMask = (rSq < rcutoff_max2);
  
@@ -589,33 +680,41 @@ static void nb_free_energy_kernel(const t_nblist&
              RealType gmx_unused ljPmeC6Grid[NSTATES];
              RealType gmx_unused alphaVdwEff;
              RealType gmx_unused alphaCoulEff;
+            RealType gmx_unused gapsysScaleLinpointVdWEff;
+            RealType gmx_unused gapsysScaleLinpointCoulEff;
+            RealType gmx_unused gapsysSigma6VdWEff[NSTATES];
              for (int i = 0; i < NSTATES; i++)
              {
                  gmx::gatherLoadTranspose<2>(nbfp.data(), typeIndices[i], &c6[i], &c12[i]);
                  qq[i]          = gmx::load<RealType>(preloadQq[i]);
                  ljPmeC6Grid[i] = gmx::load<RealType>(preloadLjPmeC6Grid[i]);
-                if constexpr (useSoftCore)
+                if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                  {
                      sigma6[i] = gmx::load<RealType>(preloadSigma6[i]);
                  }
+                if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                {
+                    gapsysSigma6VdWEff[i] = gmx::load<RealType>(preloadGapsysSigma6VdW[i]);
+                }
              }
-            if constexpr (useSoftCore)
+            if constexpr (softcoreType == KernelSoftcoreType::Beutler)
              {
                  alphaVdwEff  = gmx::load<RealType>(preloadAlphaVdwEff);
                  alphaCoulEff = gmx::load<RealType>(preloadAlphaCoulEff);
              }
+            if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+            {
+                gapsysScaleLinpointVdWEff  = gmx::load<RealType>(preloadGapsysScaleLinpointVdW);
+                gapsysScaleLinpointCoulEff = gmx::load<RealType>(preloadGapsysScaleLinpointCoul);
+            }
  
-            BoolType rSqValid = (zero < rSq);
-
-            /* The force at r=0 is zero, because of symmetry.
-             * But note that the potential is in general non-zero,
-             * since the soft-cored r will be non-zero.
-             */
-            rInv = gmx::maskzInvsqrt(rSq, rSqValid);
+            // Avoid overflow of r^-12 at distances near zero
+            rSq  = gmx::max(rSq, minDistanceSquared);
+            rInv = gmx::invsqrt(rSq);
              r    = rSq * rInv;
  
              RealType gmx_unused rp, rpm2;
-            if constexpr (useSoftCore)
+            if constexpr (softcoreType == KernelSoftcoreType::Beutler)
              {
                  rpm2 = rSq * rSq;  /* r4 */
                  rp   = rpm2 * rSq; /* r6 */
@@ -636,8 +735,8 @@ static void nb_free_energy_kernel(const t_nblist&
               * bPairIncluded is true then withinCutoffMask must also be true. */
              if (gmx::anyTrue(withinCutoffMask && bPairIncluded))
              {
-                RealType fScalC[NSTATES], fScalV[NSTATES];
-                RealType vCoul[NSTATES], vVdw[NSTATES];
+                RealType gmx_unused fScalC[NSTATES], fScalV[NSTATES];
+                RealType            vCoul[NSTATES], vVdw[NSTATES];
                  for (int i = 0; i < NSTATES; i++)
                  {
                      fScalC[i] = zero;
@@ -653,19 +752,17 @@ static void nb_free_energy_kernel(const t_nblist&
                                               && bPairIncluded && withinCutoffMask);
                      if (gmx::anyTrue(nonZeroState))
                      {
-                        if constexpr (useSoftCore)
+                        if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                          {
-                            RealType divisor      = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
-                            BoolType validDivisor = (zero < divisor);
-                            rPInvC                = gmx::maskzInv(divisor, validDivisor);
-                            pthRoot(rPInvC, &rInvC, &rC, validDivisor);
+                            RealType divisor = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
+                            rPInvC           = gmx::inv(divisor);
+                            sixthRoot(rPInvC, &rInvC, &rC);
  
                              if constexpr (scLambdasOrAlphasDiffer)
                              {
-                                RealType divisor      = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
-                                BoolType validDivisor = (zero < divisor);
-                                rPInvV                = gmx::maskzInv(divisor, validDivisor);
-                                pthRoot(rPInvV, &rInvV, &rV, validDivisor);
+                                RealType divisor = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
+                                rPInvV           = gmx::inv(divisor);
+                                sixthRoot(rPInvV, &rInvV, &rV);
                              }
                              else
                              {
@@ -703,17 +800,60 @@ static void nb_free_energy_kernel(const t_nblist&
                          {
                              if constexpr (elecInteractionTypeIsEwald)
                              {
-                                vCoul[i]  = ewaldPotential(qq[i], rInvC, sh_ewald);
-                                fScalC[i] = ewaldScalarForce(qq[i], rInvC);
+                                vCoul[i] = ewaldPotential(qq[i], rInvC, sh_ewald);
+                                if constexpr (computeForces)
+                                {
+                                    fScalC[i] = ewaldScalarForce(qq[i], rInvC);
+                                }
+
+                                if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                                {
+                                    ewaldQuadraticPotential<computeForces>(qq[i],
+                                                                           facel,
+                                                                           rC,
+                                                                           rCutoffCoul,
+                                                                           LFC[i],
+                                                                           DLF[i],
+                                                                           gapsysScaleLinpointCoulEff,
+                                                                           sh_ewald,
+                                                                           &fScalC[i],
+                                                                           &vCoul[i],
+                                                                           &dvdlCoul,
+                                                                           computeElecInteraction);
+                                }
                              }
                              else
                              {
-                                vCoul[i]  = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
-                                fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
+                                vCoul[i] = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
+                                if constexpr (computeForces)
+                                {
+                                    fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
+                                }
+
+                                if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                                {
+                                    reactionFieldQuadraticPotential<computeForces>(
+                                            qq[i],
+                                            facel,
+                                            rC,
+                                            rCutoffCoul,
+                                            LFC[i],
+                                            DLF[i],
+                                            gapsysScaleLinpointCoulEff,
+                                            krf,
+                                            crf,
+                                            &fScalC[i],
+                                            &vCoul[i],
+                                            &dvdlCoul,
+                                            computeElecInteraction);
+                                }
                              }
  
-                            vCoul[i]  = gmx::selectByMask(vCoul[i], computeElecInteraction);
-                            fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
+                            vCoul[i] = gmx::selectByMask(vCoul[i], computeElecInteraction);
+                            if constexpr (computeForces)
+                            {
+                                fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
+                            }
                          }
  
                          /* Only process the VDW interactions if we either
@@ -724,17 +864,17 @@ static void nb_free_energy_kernel(const t_nblist&
                          if constexpr (vdwInteractionTypeIsEwald)
                          {
                              computeVdwInteraction =
-                                    (r < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
+                                    (r < rVdw && (c6[i] != zero || c12[i] != zero) && bPairIncluded);
                          }
                          else
                          {
                              computeVdwInteraction =
-                                    (rV < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
+                                    (rV < rVdw && (c6[i] != zero || c12[i] != zero) && bPairIncluded);
                          }
                          if (gmx::anyTrue(computeVdwInteraction))
                          {
                              RealType rInv6;
-                            if constexpr (useSoftCore)
+                            if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                              {
                                  rInv6 = rPInvV;
                              }
@@ -742,12 +882,39 @@ static void nb_free_energy_kernel(const t_nblist&
                              {
                                  rInv6 = calculateRinv6(rInvV);
                              }
+                            // Avoid overflow at short distance for masked exclusions and
+                            // for foreign energy calculations at a hard core end state.
+                            // Note that we should limit r^-6, and thus also r^-12, and
+                            // not only r^-12, as that could lead to erroneously low instead
+                            // of very high foreign energies.
+                            rInv6           = gmx::min(rInv6, maxRInvSix);
                              RealType vVdw6  = calculateVdw6(c6[i], rInv6);
                              RealType vVdw12 = calculateVdw12(c12[i], rInv6);
  
                              vVdw[i] = lennardJonesPotential(
                                      vVdw6, vVdw12, c6[i], c12[i], repulsionShift, dispersionShift, oneSixth, oneTwelfth);
-                            fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
+                            if constexpr (computeForces)
+                            {
+                                fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
+                            }
+
+                            if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                            {
+                                lennardJonesQuadraticPotential<computeForces>(c6[i],
+                                                                              c12[i],
+                                                                              r,
+                                                                              rSq,
+                                                                              LFV[i],
+                                                                              DLF[i],
+                                                                              gapsysSigma6VdWEff[i],
+                                                                              gapsysScaleLinpointVdWEff,
+                                                                              repulsionShift,
+                                                                              dispersionShift,
+                                                                              &fScalV[i],
+                                                                              &vVdw[i],
+                                                                              &dvdlVdw,
+                                                                              computeVdwInteraction);
+                            }
  
                              if constexpr (vdwInteractionTypeIsEwald)
                              {
@@ -767,24 +934,33 @@ static void nb_free_energy_kernel(const t_nblist&
                                  const RealType d2      = d * d;
                                  const RealType sw =
                                          one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
-                                const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
  
-                                fScalV[i] = potSwitchScalarForceMod(
-                                        fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
+                                if constexpr (computeForces)
+                                {
+                                    const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
+                                    fScalV[i]          = potSwitchScalarForceMod(
+                                            fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
+                                }
                                  vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, potSwitchMask);
                              }
  
-                            vVdw[i]   = gmx::selectByMask(vVdw[i], computeVdwInteraction);
-                            fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
+                            vVdw[i] = gmx::selectByMask(vVdw[i], computeVdwInteraction);
+                            if constexpr (computeForces)
+                            {
+                                fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
+                            }
                          }
  
-                        /* fScalC (and fScalV) now contain: dV/drC * rC
-                         * Now we multiply by rC^-p, so it will be: dV/drC * rC^1-p
-                         * Further down we first multiply by r^p-2 and then by
-                         * the vector r, which in total gives: dV/drC * (r/rC)^1-p
-                         */
-                        fScalC[i] = fScalC[i] * rPInvC;
-                        fScalV[i] = fScalV[i] * rPInvV;
+                        if constexpr (computeForces)
+                        {
+                            /* fScalC (and fScalV) now contain: dV/drC * rC
+                             * Now we multiply by rC^-6, so it will be: dV/drC * rC^-5
+                             * Further down we first multiply by r^4 and then by
+                             * the vector r, which in total gives: dV/drC * (r/rC)^-5
+                             */
+                            fScalC[i] = fScalC[i] * rPInvC;
+                            fScalV[i] = fScalV[i] * rPInvV;
+                        }
                      } // end of block requiring nonZeroState
                  }     // end for (int i = 0; i < NSTATES; i++)
  
@@ -797,10 +973,13 @@ static void nb_free_energy_kernel(const t_nblist&
                          vCTot = vCTot + LFC[i] * vCoul[i];
                          vVTot = vVTot + LFV[i] * vVdw[i];
  
-                        fScal = fScal + LFC[i] * fScalC[i] * rpm2;
-                        fScal = fScal + LFV[i] * fScalV[i] * rpm2;
+                        if constexpr (computeForces)
+                        {
+                            fScal = fScal + LFC[i] * fScalC[i] * rpm2;
+                            fScal = fScal + LFV[i] * fScalV[i] * rpm2;
+                        }
  
-                        if constexpr (useSoftCore)
+                        if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                          {
                              dvdlCoul = dvdlCoul + vCoul[i] * DLF[i]
                                         + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i];
@@ -857,8 +1036,11 @@ static void nb_free_energy_kernel(const t_nblist&
                   */
                  RealType v_lr, f_lr;
  
-                pmeCoulombCorrectionVF(rSq, ewaldBeta, &v_lr, &f_lr, rSqValid);
-                f_lr = f_lr * rInv * rInv;
+                pmeCoulombCorrectionVF<computeForces>(rSq, ewaldBeta, &v_lr, &f_lr);
+                if constexpr (computeForces)
+                {
+                    f_lr = f_lr * rInv * rInv;
+                }
  
                  /* Note that any possible Ewald shift has already been applied in
                   * the normal interaction part above.
@@ -874,7 +1056,10 @@ static void nb_free_energy_kernel(const t_nblist&
                  for (int i = 0; i < NSTATES; i++)
                  {
                      vCTot = vCTot - gmx::selectByMask(LFC[i] * qq[i] * v_lr, computeElecEwaldInteraction);
-                    fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
+                    if constexpr (computeForces)
+                    {
+                        fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
+                    }
                      dvdlCoul = dvdlCoul
                                 - gmx::selectByMask(DLF[i] * qq[i] * v_lr, computeElecEwaldInteraction);
                  }
@@ -893,19 +1078,22 @@ static void nb_free_energy_kernel(const t_nblist&
                   */
  
                  RealType v_lr, f_lr;
-                pmeLJCorrectionVF(
+                pmeLJCorrectionVF<computeForces>(
                          rInv, rSq, ewaldLJCoeffSq, ewaldLJCoeffSixDivSix, &v_lr, &f_lr, computeVdwEwaldInteraction, bIiEqJnr);
                  v_lr = v_lr * oneSixth;
  
                  for (int i = 0; i < NSTATES; i++)
                  {
                      vVTot = vVTot + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
-                    fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
+                    if constexpr (computeForces)
+                    {
+                        fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
+                    }
                      dvdlVdw = dvdlVdw + gmx::selectByMask(DLF[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
                  }
              }
  
-            if (doForces && gmx::anyTrue(fScal != zero))
+            if (computeForces && gmx::anyTrue(fScal != zero))
              {
                  const RealType tX = fScal * dX;
                  const RealType tY = fScal * dY;
@@ -914,22 +1102,21 @@ static void nb_free_energy_kernel(const t_nblist&
                  fIY               = fIY + tY;
                  fIZ               = fIZ + tZ;
  
-                gmx::transposeScatterDecrU<3>(
-                        reinterpret_cast<real*>(threadForceBuffer), preloadJnr, tX, tY, tZ);
+                gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ);
              }
          } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
  
          if (havePairsWithinCutoff)
          {
-            if (doForces)
-            {
-                gmx::transposeScatterIncrU<3>(
-                        reinterpret_cast<real*>(threadForceBuffer), preloadIi, fIX, fIY, fIZ);
-            }
-            if (doShiftForces)
+            if constexpr (computeForces)
              {
-                gmx::transposeScatterIncrU<3>(
-                        reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
+                gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ);
+
+                if (doShiftForces)
+                {
+                    gmx::transposeScatterIncrU<3>(
+                            reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
+                }
              }
              if (doPotential)
              {
@@ -985,95 +1172,115 @@ typedef void (*KernelFunction)(const t_nblist&
                                 int                                              flags,
                                 gmx::ArrayRef<const real>                        lambda,
                                 t_nrnb* gmx_restrict                             nrnb,
-                               gmx::RVec*                                       threadForceBuffer,
+                               gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
                                 rvec*               threadForceShiftBuffer,
                                 gmx::ArrayRef<real> threadVc,
                                 gmx::ArrayRef<real> threadVv,
                                 gmx::ArrayRef<real> threadDvdl);
  
-template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
  static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
  {
      if (useSimd)
      {
  #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS
-        return (nb_free_energy_kernel<SimdDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
+        return (nb_free_energy_kernel<SimdDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
  #else
-        return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
+        return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
  #endif
      }
      else
      {
-        return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
+        return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
      }
  }
  
-template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
-static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch, const bool useSimd)
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
+static KernelFunction dispatchKernelOnComputeForces(const bool computeForces, const bool useSimd)
  {
-    if (vdwModifierIsPotSwitch)
+    if (computeForces)
      {
-        return (dispatchKernelOnUseSimd<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
+        return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, true>(
                  useSimd));
      }
      else
      {
-        return (dispatchKernelOnUseSimd<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
+        return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, false>(
                  useSimd));
      }
  }
  
-template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
+static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch,
+                                                  const bool computeForces,
+                                                  const bool useSimd)
+{
+    if (vdwModifierIsPotSwitch)
+    {
+        return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
+                computeForces, useSimd));
+    }
+    else
+    {
+        return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
+                computeForces, useSimd));
+    }
+}
+
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
  static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
                                                            const bool vdwModifierIsPotSwitch,
+                                                          const bool computeForces,
                                                            const bool useSimd)
  {
      if (elecInteractionTypeIsEwald)
      {
-        return (dispatchKernelOnVdwModifier<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
-                vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
+                vdwModifierIsPotSwitch, computeForces, useSimd));
      }
      else
      {
-        return (dispatchKernelOnVdwModifier<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
-                vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
+                vdwModifierIsPotSwitch, computeForces, useSimd));
      }
  }
  
-template<bool useSoftCore, bool scLambdasOrAlphasDiffer>
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer>
  static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
                                                           const bool elecInteractionTypeIsEwald,
                                                           const bool vdwModifierIsPotSwitch,
+                                                         const bool computeForces,
                                                           const bool useSimd)
  {
      if (vdwInteractionTypeIsEwald)
      {
-        return (dispatchKernelOnElecInteractionType<useSoftCore, scLambdasOrAlphasDiffer, true>(
-                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, true>(
+                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
      }
      else
      {
-        return (dispatchKernelOnElecInteractionType<useSoftCore, scLambdasOrAlphasDiffer, false>(
-                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, false>(
+                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
      }
  }
  
-template<bool useSoftCore>
+template<KernelSoftcoreType softcoreType>
  static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
                                                                    const bool vdwInteractionTypeIsEwald,
                                                                    const bool elecInteractionTypeIsEwald,
                                                                    const bool vdwModifierIsPotSwitch,
+                                                                  const bool computeForces,
                                                                    const bool useSimd)
  {
      if (scLambdasOrAlphasDiffer)
      {
-        return (dispatchKernelOnVdwInteractionType<useSoftCore, true>(
-                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnVdwInteractionType<softcoreType, true>(
+                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
      }
      else
      {
-        return (dispatchKernelOnVdwInteractionType<useSoftCore, false>(
-                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnVdwInteractionType<softcoreType, false>(
+                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
      }
  }
  
@@ -1081,24 +1288,50 @@ static KernelFunction dispatchKernel(const bool                 scLambdasOrAlpha
                                       const bool                 vdwInteractionTypeIsEwald,
                                       const bool                 elecInteractionTypeIsEwald,
                                       const bool                 vdwModifierIsPotSwitch,
+                                     const bool                 computeForces,
                                       const bool                 useSimd,
                                       const interaction_const_t& ic)
  {
-    if (ic.softCoreParameters->alphaCoulomb == 0 && ic.softCoreParameters->alphaVdw == 0)
+    const auto& scParams = *ic.softCoreParameters;
+    if (scParams.softcoreType == SoftcoreType::Beutler)
      {
-        return (dispatchKernelOnScLambdasOrAlphasDifference<false>(scLambdasOrAlphasDiffer,
-                                                                   vdwInteractionTypeIsEwald,
-                                                                   elecInteractionTypeIsEwald,
-                                                                   vdwModifierIsPotSwitch,
-                                                                   useSimd));
+        if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
+        {
+            return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
+                    scLambdasOrAlphasDiffer,
+                    vdwInteractionTypeIsEwald,
+                    elecInteractionTypeIsEwald,
+                    vdwModifierIsPotSwitch,
+                    computeForces,
+                    useSimd));
+        }
+        return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Beutler>(
+                scLambdasOrAlphasDiffer,
+                vdwInteractionTypeIsEwald,
+                elecInteractionTypeIsEwald,
+                vdwModifierIsPotSwitch,
+                computeForces,
+                useSimd));
      }
-    else
+    else // Gapsys
      {
-        return (dispatchKernelOnScLambdasOrAlphasDifference<true>(scLambdasOrAlphasDiffer,
-                                                                  vdwInteractionTypeIsEwald,
-                                                                  elecInteractionTypeIsEwald,
-                                                                  vdwModifierIsPotSwitch,
-                                                                  useSimd));
+        if (scParams.gapsysScaleLinpointCoul == 0 && scParams.gapsysScaleLinpointVdW == 0)
+        {
+            return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
+                    scLambdasOrAlphasDiffer,
+                    vdwInteractionTypeIsEwald,
+                    elecInteractionTypeIsEwald,
+                    vdwModifierIsPotSwitch,
+                    computeForces,
+                    useSimd));
+        }
+        return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Gapsys>(
+                scLambdasOrAlphasDiffer,
+                vdwInteractionTypeIsEwald,
+                elecInteractionTypeIsEwald,
+                vdwModifierIsPotSwitch,
+                computeForces,
+                useSimd));
      }
  }
  
@@ -1119,7 +1352,7 @@ void gmx_nb_free_energy_kernel(const t_nblist&
                                 int                                              flags,
                                 gmx::ArrayRef<const real>                        lambda,
                                 t_nrnb*                                          nrnb,
-                               gmx::RVec*                                       threadForceBuffer,
+                               gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
                                 rvec*               threadForceShiftBuffer,
                                 gmx::ArrayRef<real> threadVc,
                                 gmx::ArrayRef<real> threadVv,
@@ -1129,10 +1362,16 @@ void gmx_nb_free_energy_kernel(const t_nblist&
                 "Unsupported eeltype with free energy");
      GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters");
  
+    // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert
+    GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty()
+                       || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(),
+               "We need actual padding with at least one element for SIMD scatter operations");
+
      const auto& scParams                   = *ic.softCoreParameters;
      const bool  vdwInteractionTypeIsEwald  = (EVDW_PME(ic.vdwtype));
      const bool  elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype));
      const bool  vdwModifierIsPotSwitch     = (ic.vdw_modifier == InteractionModifiers::PotSwitch);
+    const bool  computeForces              = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
      bool        scLambdasOrAlphasDiffer    = true;
  
      if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
@@ -1154,6 +1393,7 @@ void gmx_nb_free_energy_kernel(const t_nblist&
                                  vdwInteractionTypeIsEwald,
                                  elecInteractionTypeIsEwald,
                                  vdwModifierIsPotSwitch,
+                                computeForces,
                                  useSimd,
                                  ic);
      kernelFunc(nlist,