Avoid comparing Simd types to real 0 when making masks in NB FEP kernel.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_free_energy.cpp
index ad316bd5da5020a31631387cbb416f019b398993..85f998cfc546412a2fafffbac2623f846656fe4a 100644 (file)
@@ -63,6 +63,7 @@
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/arrayref.h"
 
+#include "nb_softcore.h"
 
 //! Scalar (non-SIMD) data types.
 struct ScalarDataTypes
@@ -90,46 +91,71 @@ struct SimdDataTypes
 };
 #endif
 
-template<class RealType, class BoolType>
+/*! \brief Lower limit for square interaction distances in nonbonded kernels.
+ *
+ * This is a mimimum on r^2 to avoid overflows when computing r^6.
+ * This will only affect results for soft-cored interaction at distances smaller
+ * than 1e-6 and will limit extremely high foreign energies for overlapping atoms.
+ * Note that we could use a somewhat smaller minimum in double precision.
+ * But because invsqrt in double precision can use single precision, this number
+ * can not be much smaller, we use the same number for simplicity.
+ */
+constexpr real c_minDistanceSquared = 1.0e-12_real;
+
+/*! \brief Higher limit for r^-6 used for Lennard-Jones interactions
+ *
+ * This is needed to avoid overflow of LJ energy and force terms for excluded
+ * atoms and foreign energies of hard-core states of overlapping atoms.
+ * Note that in single precision this value leaves room for C12 coefficients up to 3.4e8.
+ */
+constexpr real c_maxRInvSix = 1.0e15_real;
+
+template<bool computeForces, class RealType>
 static inline void
-pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType* force, const BoolType mask)
+pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType gmx_unused* force)
 {
-    const RealType brsq = gmx::selectByMask(rSq * beta * beta, mask);
-    *force              = -brsq * beta * gmx::pmeForceCorrection(brsq);
-    *pot                = beta * gmx::pmePotentialCorrection(brsq);
+    const RealType brsq = rSq * beta * beta;
+    if constexpr (computeForces)
+    {
+        *force = -brsq * beta * gmx::pmeForceCorrection(brsq);
+    }
+    *pot = beta * gmx::pmePotentialCorrection(brsq);
 }
 
-template<class RealType, class BoolType>
+template<bool computeForces, class RealType, class BoolType>
 static inline void pmeLJCorrectionVF(const RealType rInv,
                                      const RealType rSq,
                                      const real     ewaldLJCoeffSq,
                                      const real     ewaldLJCoeffSixDivSix,
                                      RealType*      pot,
-                                     RealType*      force,
-                                     const BoolType mask,
-                                     const BoolType bIiEqJnr)
+                                     RealType gmx_unused* force,
+                                     const BoolType       mask,
+                                     const BoolType       bIiEqJnr)
 {
     // We mask rInv to get zero force and potential for masked out pair interactions
-    const RealType rInvSq  = gmx::selectByMask(rInv * rInv, mask);
+    const RealType rInvSq  = rInv * rInv;
     const RealType rInvSix = rInvSq * rInvSq * rInvSq;
     // Mask rSq to avoid underflow in exp()
     const RealType coeffSqRSq       = ewaldLJCoeffSq * gmx::selectByMask(rSq, mask);
     const RealType expNegCoeffSqRSq = gmx::exp(-coeffSqRSq);
     const RealType poly             = 1.0_real + coeffSqRSq + 0.5_real * coeffSqRSq * coeffSqRSq;
-    *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
-    *force = *force * rInvSq;
+    if constexpr (computeForces)
+    {
+        *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
+        *force = *force * rInvSq;
+    }
     // The self interaction is the limit for r -> 0 which we need to compute separately
     *pot = gmx::blend(
             rInvSix * (1.0_real - expNegCoeffSqRSq * poly), 0.5_real * ewaldLJCoeffSixDivSix, bIiEqJnr);
 }
 
-//! Computes r^(1/p) and 1/r^(1/p) for the standard p=6
-template<class RealType, class BoolType>
-static inline void pthRoot(const RealType r, RealType* pthRoot, RealType* invPthRoot, const BoolType mask)
+//! Computes r^(1/6) and 1/r^(1/6)
+template<class RealType>
+static inline void sixthRoot(const RealType r, RealType* sixthRoot, RealType* invSixthRoot)
 {
     RealType cbrtRes = gmx::cbrt(r);
-    *invPthRoot      = gmx::maskzInvsqrt(cbrtRes, mask);
-    *pthRoot         = gmx::maskzInv(*invPthRoot, mask);
+    *invSixthRoot    = gmx::invsqrt(cbrtRes);
+    *sixthRoot       = gmx::inv(*invSixthRoot);
 }
 
 template<class RealType>
@@ -232,7 +258,7 @@ static inline RealType potSwitchPotentialMod(const RealType potentialInp, const
 
 
 //! Templated free-energy non-bonded kernel
-template<typename DataTypes, bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
+template<typename DataTypes, KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
 static void nb_free_energy_kernel(const t_nblist&                                  nlist,
                                   const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
                                   const int                                        ntype,
@@ -248,8 +274,8 @@ static void nb_free_energy_kernel(const t_nblist&
                                   int                                              flags,
                                   gmx::ArrayRef<const real>                        lambda,
                                   t_nrnb* gmx_restrict                             nrnb,
-                                  gmx::RVec*          threadForceBuffer,
-                                  rvec*               threadForceShiftBuffer,
+                                  gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
+                                  rvec gmx_unused*    threadForceShiftBuffer,
                                   gmx::ArrayRef<real> threadVc,
                                   gmx::ArrayRef<real> threadVv,
                                   gmx::ArrayRef<real> threadDvdl)
@@ -278,16 +304,22 @@ static void nb_free_energy_kernel(const t_nblist&
     gmx::ArrayRef<const int> shift  = nlist.shift;
     gmx::ArrayRef<const int> gid    = nlist.gid;
 
-    const real  lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
-    const real  lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
-    const auto& scParams    = *ic.softCoreParameters;
-    const real gmx_unused alpha_coul    = scParams.alphaCoulomb;
-    const real gmx_unused alpha_vdw     = scParams.alphaVdw;
-    const real            lam_power     = scParams.lambdaPower;
-    const real gmx_unused sigma6_def    = scParams.sigma6WithInvalidSigma;
-    const real gmx_unused sigma6_min    = scParams.sigma6Minimum;
-    const bool            doForces      = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
-    const bool            doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
+    const real lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
+    const real lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
+
+    // Extract softcore parameters
+    const auto&           scParams   = *ic.softCoreParameters;
+    const real            lam_power  = scParams.lambdaPower;
+    const real gmx_unused alpha_coul = scParams.alphaCoulomb;
+    const real gmx_unused alpha_vdw  = scParams.alphaVdw;
+    const real gmx_unused sigma6_def = scParams.sigma6WithInvalidSigma;
+    const real gmx_unused sigma6_min = scParams.sigma6Minimum;
+
+    const real gmx_unused gapsysScaleLinpointCoul = scParams.gapsysScaleLinpointCoul;
+    const real gmx_unused gapsysScaleLinpointVdW  = scParams.gapsysScaleLinpointVdW;
+    const real gmx_unused gapsysSigma6VdW         = scParams.gapsysSigma6VdW;
+
+    const bool gmx_unused doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
     const bool            doPotential   = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0);
 
     // Extract data from interaction_const_t
@@ -340,8 +372,9 @@ static void nb_free_energy_kernel(const t_nblist&
         icoul = NbkernelElecType::None;
     }
 
-    real rcutoff_max2 = std::max(ic.rcoulomb, ic.rvdw);
-    rcutoff_max2      = rcutoff_max2 * rcutoff_max2;
+    real rcutoff_max2                 = std::max(ic.rcoulomb, ic.rvdw);
+    rcutoff_max2                      = rcutoff_max2 * rcutoff_max2;
+    const real gmx_unused rCutoffCoul = ic.rcoulomb;
 
     real gmx_unused sh_ewald = 0;
     if constexpr (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
@@ -364,8 +397,12 @@ static void nb_free_energy_kernel(const t_nblist&
     GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
                        "Can not apply soft-core to switched Ewald potentials");
 
-    RealType dvdlCoul(zero);
-    RealType dvdlVdw(zero);
+    const RealType            minDistanceSquared(c_minDistanceSquared);
+    const RealType            maxRInvSix(c_maxRInvSix);
+    const RealType gmx_unused floatMin(GMX_FLOAT_MIN);
+
+    RealType dvdlCoul{ zero };
+    RealType dvdlVdw{ zero };
 
     /* Lambda factor for state A, 1-lambda*/
     real LFC[NSTATES], LFV[NSTATES];
@@ -391,8 +428,20 @@ static void nb_free_energy_kernel(const t_nblist&
         dlFacVdw[i]  = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
     }
 
-    // TODO: We should get rid of using pointers to real
+    // We need pointers to real for SIMD access
     const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
+    real* gmx_restrict       forceRealPtr;
+    if constexpr (computeForces)
+    {
+        GMX_ASSERT(nri == 0 || !threadForceBuffer.empty(), "need a valid threadForceBuffer");
+
+        forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0];
+
+        if (doShiftForces)
+        {
+            GMX_ASSERT(threadForceShiftBuffer != nullptr, "need a valid threadForceShiftBuffer");
+        }
+    }
 
     const real rlistSquared = gmx::square(rlist);
 
@@ -424,16 +473,16 @@ static void nb_free_energy_kernel(const t_nblist&
         RealType   fIZ(0);
 
 #if GMX_SIMD_HAVE_REAL
-        alignas(GMX_SIMD_ALIGNMENT) int preloadIi[DataTypes::simdRealWidth];
-        alignas(GMX_SIMD_ALIGNMENT) int preloadIs[DataTypes::simdRealWidth];
+        alignas(GMX_SIMD_ALIGNMENT) int            preloadIi[DataTypes::simdRealWidth];
+        alignas(GMX_SIMD_ALIGNMENT) int gmx_unused preloadIs[DataTypes::simdRealWidth];
 #else
-        int preloadIi[DataTypes::simdRealWidth];
-        int preloadIs[DataTypes::simdRealWidth];
+        int            preloadIi[DataTypes::simdRealWidth];
+        int gmx_unused preloadIs[DataTypes::simdRealWidth];
 #endif
-        for (int s = 0; s < DataTypes::simdRealWidth; s++)
+        for (int i = 0; i < DataTypes::simdRealWidth; i++)
         {
-            preloadIi[s] = ii;
-            preloadIs[s] = shift[n];
+            preloadIi[i] = ii;
+            preloadIs[i] = shift[n];
         }
         IntType ii_s = gmx::load<IntType>(preloadIi);
 
@@ -450,6 +499,12 @@ static void nb_free_energy_kernel(const t_nblist&
             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
             alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT)
+                    real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT)
+                    real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT)
+                    real gmx_unused preloadGapsysSigma6VdW[NSTATES][DataTypes::simdRealWidth];
             alignas(GMX_SIMD_ALIGNMENT) real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
 #else
             real            preloadPairIsValid[DataTypes::simdRealWidth];
@@ -460,84 +515,120 @@ static void nb_free_energy_kernel(const t_nblist&
             real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
             real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
             real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
+            real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
+            real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
+            real gmx_unused preloadGapsysSigma6VdW[NSTATES][DataTypes::simdRealWidth];
             real            preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
 #endif
-            for (int s = 0; s < DataTypes::simdRealWidth; s++)
+            for (int j = 0; j < DataTypes::simdRealWidth; j++)
             {
-                if (k + s < nj1)
+                if (k + j < nj1)
                 {
-                    preloadPairIsValid[s] = true;
+                    preloadPairIsValid[j] = true;
                     /* Check if this pair on the exclusions list.*/
-                    preloadPairIncluded[s]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + s]);
-                    const int jnr           = jjnr[k + s];
-                    preloadJnr[s]           = jnr;
-                    typeIndices[STATE_A][s] = ntiA + typeA[jnr];
-                    typeIndices[STATE_B][s] = ntiB + typeB[jnr];
-                    preloadQq[STATE_A][s]   = iqA * chargeA[jnr];
-                    preloadQq[STATE_B][s]   = iqB * chargeB[jnr];
+                    preloadPairIncluded[j]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + j]);
+                    const int jnr           = jjnr[k + j];
+                    preloadJnr[j]           = jnr;
+                    typeIndices[STATE_A][j] = ntiA + typeA[jnr];
+                    typeIndices[STATE_B][j] = ntiB + typeB[jnr];
+                    preloadQq[STATE_A][j]   = iqA * chargeA[jnr];
+                    preloadQq[STATE_B][j]   = iqB * chargeB[jnr];
 
                     for (int i = 0; i < NSTATES; i++)
                     {
                         if constexpr (vdwInteractionTypeIsEwald)
                         {
-                            preloadLjPmeC6Grid[i][s] = nbfp_grid[2 * typeIndices[i][s]];
+                            preloadLjPmeC6Grid[i][j] = nbfp_grid[2 * typeIndices[i][j]];
                         }
                         else
                         {
-                            preloadLjPmeC6Grid[i][s] = 0;
+                            preloadLjPmeC6Grid[i][j] = 0;
                         }
-                        if constexpr (useSoftCore)
+                        if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                         {
-                            const real c6  = nbfp[2 * typeIndices[i][s]];
-                            const real c12 = nbfp[2 * typeIndices[i][s] + 1];
+                            const real c6  = nbfp[2 * typeIndices[i][j]];
+                            const real c12 = nbfp[2 * typeIndices[i][j] + 1];
                             if (c6 > 0 && c12 > 0)
                             {
                                 /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
-                                preloadSigma6[i][s] = 0.5_real * c12 / c6;
-                                if (preloadSigma6[i][s]
+                                preloadSigma6[i][j] = 0.5_real * c12 / c6;
+                                if (preloadSigma6[i][j]
                                     < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */
                                 {
-                                    preloadSigma6[i][s] = sigma6_min;
+                                    preloadSigma6[i][j] = sigma6_min;
                                 }
                             }
                             else
                             {
-                                preloadSigma6[i][s] = sigma6_def;
+                                preloadSigma6[i][j] = sigma6_def;
+                            }
+                        }
+                        if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                        {
+                            const real c6  = nbfp[2 * typeIndices[i][j]];
+                            const real c12 = nbfp[2 * typeIndices[i][j] + 1];
+                            if (c6 > 0 && c12 > 0)
+                            {
+                                /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
+                                preloadGapsysSigma6VdW[i][j] = 0.5_real * c12 / c6;
+                            }
+                            else
+                            {
+                                preloadGapsysSigma6VdW[i][j] = gapsysSigma6VdW;
                             }
                         }
                     }
-                    if constexpr (useSoftCore)
+                    if constexpr (softcoreType == KernelSoftcoreType::Beutler)
+                    {
+                        /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
+                        const real c12A = nbfp[2 * typeIndices[STATE_A][j] + 1];
+                        const real c12B = nbfp[2 * typeIndices[STATE_B][j] + 1];
+                        if (c12A > 0 && c12B > 0)
+                        {
+                            preloadAlphaVdwEff[j]  = 0;
+                            preloadAlphaCoulEff[j] = 0;
+                        }
+                        else
+                        {
+                            preloadAlphaVdwEff[j]  = alpha_vdw;
+                            preloadAlphaCoulEff[j] = alpha_coul;
+                        }
+                    }
+                    if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
                     {
                         /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
-                        const real c12A = nbfp[2 * typeIndices[STATE_A][s] + 1];
-                        const real c12B = nbfp[2 * typeIndices[STATE_B][s] + 1];
+                        const real c12A = nbfp[2 * typeIndices[STATE_A][j] + 1];
+                        const real c12B = nbfp[2 * typeIndices[STATE_B][j] + 1];
                         if (c12A > 0 && c12B > 0)
                         {
-                            preloadAlphaVdwEff[s]  = 0;
-                            preloadAlphaCoulEff[s] = 0;
+                            preloadGapsysScaleLinpointVdW[j]  = 0;
+                            preloadGapsysScaleLinpointCoul[j] = 0;
                         }
                         else
                         {
-                            preloadAlphaVdwEff[s]  = alpha_vdw;
-                            preloadAlphaCoulEff[s] = alpha_coul;
+                            preloadGapsysScaleLinpointVdW[j]  = gapsysScaleLinpointVdW;
+                            preloadGapsysScaleLinpointCoul[j] = gapsysScaleLinpointCoul;
                         }
                     }
                 }
                 else
                 {
-                    preloadJnr[s]          = jjnr[k];
-                    preloadPairIsValid[s]  = false;
-                    preloadPairIncluded[s] = false;
-                    preloadAlphaVdwEff[s]  = 0;
-                    preloadAlphaCoulEff[s] = 0;
-
+                    preloadJnr[j]                     = jjnr[k];
+                    preloadPairIsValid[j]             = false;
+                    preloadPairIncluded[j]            = false;
+                    preloadAlphaVdwEff[j]             = 0;
+                    preloadAlphaCoulEff[j]            = 0;
+                    preloadGapsysScaleLinpointVdW[j]  = 0;
+                    preloadGapsysScaleLinpointCoul[j] = 0;
+
+                    typeIndices[STATE_A][j] = ntiA + typeA[jjnr[k]];
+                    typeIndices[STATE_B][j] = ntiB + typeB[jjnr[k]];
                     for (int i = 0; i < NSTATES; i++)
                     {
-                        typeIndices[STATE_A][s]  = ntiA + typeA[jjnr[k]];
-                        typeIndices[STATE_B][s]  = ntiB + typeB[jjnr[k]];
-                        preloadLjPmeC6Grid[i][s] = 0;
-                        preloadQq[i][s]          = 0;
-                        preloadSigma6[i][s]      = 0;
+                        preloadLjPmeC6Grid[i][j]     = 0;
+                        preloadQq[i][j]              = 0;
+                        preloadSigma6[i][j]          = 0;
+                        preloadGapsysSigma6VdW[i][j] = 0;
                     }
                 }
             }
@@ -553,7 +644,7 @@ static void nb_free_energy_kernel(const t_nblist&
             const RealType dX  = ix - jx;
             const RealType dY  = iy - jy;
             const RealType dZ  = iz - jz;
-            const RealType rSq = dX * dX + dY * dY + dZ * dZ;
+            RealType       rSq = dX * dX + dY * dY + dZ * dZ;
 
             BoolType withinCutoffMask = (rSq < rcutoff_max2);
 
@@ -589,33 +680,41 @@ static void nb_free_energy_kernel(const t_nblist&
             RealType gmx_unused ljPmeC6Grid[NSTATES];
             RealType gmx_unused alphaVdwEff;
             RealType gmx_unused alphaCoulEff;
+            RealType gmx_unused gapsysScaleLinpointVdWEff;
+            RealType gmx_unused gapsysScaleLinpointCoulEff;
+            RealType gmx_unused gapsysSigma6VdWEff[NSTATES];
             for (int i = 0; i < NSTATES; i++)
             {
                 gmx::gatherLoadTranspose<2>(nbfp.data(), typeIndices[i], &c6[i], &c12[i]);
                 qq[i]          = gmx::load<RealType>(preloadQq[i]);
                 ljPmeC6Grid[i] = gmx::load<RealType>(preloadLjPmeC6Grid[i]);
-                if constexpr (useSoftCore)
+                if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                 {
                     sigma6[i] = gmx::load<RealType>(preloadSigma6[i]);
                 }
+                if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                {
+                    gapsysSigma6VdWEff[i] = gmx::load<RealType>(preloadGapsysSigma6VdW[i]);
+                }
             }
-            if constexpr (useSoftCore)
+            if constexpr (softcoreType == KernelSoftcoreType::Beutler)
             {
                 alphaVdwEff  = gmx::load<RealType>(preloadAlphaVdwEff);
                 alphaCoulEff = gmx::load<RealType>(preloadAlphaCoulEff);
             }
+            if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+            {
+                gapsysScaleLinpointVdWEff  = gmx::load<RealType>(preloadGapsysScaleLinpointVdW);
+                gapsysScaleLinpointCoulEff = gmx::load<RealType>(preloadGapsysScaleLinpointCoul);
+            }
 
-            BoolType rSqValid = (zero < rSq);
-
-            /* The force at r=0 is zero, because of symmetry.
-             * But note that the potential is in general non-zero,
-             * since the soft-cored r will be non-zero.
-             */
-            rInv = gmx::maskzInvsqrt(rSq, rSqValid);
+            // Avoid overflow of r^-12 at distances near zero
+            rSq  = gmx::max(rSq, minDistanceSquared);
+            rInv = gmx::invsqrt(rSq);
             r    = rSq * rInv;
 
             RealType gmx_unused rp, rpm2;
-            if constexpr (useSoftCore)
+            if constexpr (softcoreType == KernelSoftcoreType::Beutler)
             {
                 rpm2 = rSq * rSq;  /* r4 */
                 rp   = rpm2 * rSq; /* r6 */
@@ -636,8 +735,8 @@ static void nb_free_energy_kernel(const t_nblist&
              * bPairIncluded is true then withinCutoffMask must also be true. */
             if (gmx::anyTrue(withinCutoffMask && bPairIncluded))
             {
-                RealType fScalC[NSTATES], fScalV[NSTATES];
-                RealType vCoul[NSTATES], vVdw[NSTATES];
+                RealType gmx_unused fScalC[NSTATES], fScalV[NSTATES];
+                RealType            vCoul[NSTATES], vVdw[NSTATES];
                 for (int i = 0; i < NSTATES; i++)
                 {
                     fScalC[i] = zero;
@@ -653,19 +752,17 @@ static void nb_free_energy_kernel(const t_nblist&
                                              && bPairIncluded && withinCutoffMask);
                     if (gmx::anyTrue(nonZeroState))
                     {
-                        if constexpr (useSoftCore)
+                        if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                         {
-                            RealType divisor      = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
-                            BoolType validDivisor = (zero < divisor);
-                            rPInvC                = gmx::maskzInv(divisor, validDivisor);
-                            pthRoot(rPInvC, &rInvC, &rC, validDivisor);
+                            RealType divisor = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
+                            rPInvC           = gmx::inv(divisor);
+                            sixthRoot(rPInvC, &rInvC, &rC);
 
                             if constexpr (scLambdasOrAlphasDiffer)
                             {
-                                RealType divisor      = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
-                                BoolType validDivisor = (zero < divisor);
-                                rPInvV                = gmx::maskzInv(divisor, validDivisor);
-                                pthRoot(rPInvV, &rInvV, &rV, validDivisor);
+                                RealType divisor = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
+                                rPInvV           = gmx::inv(divisor);
+                                sixthRoot(rPInvV, &rInvV, &rV);
                             }
                             else
                             {
@@ -703,17 +800,60 @@ static void nb_free_energy_kernel(const t_nblist&
                         {
                             if constexpr (elecInteractionTypeIsEwald)
                             {
-                                vCoul[i]  = ewaldPotential(qq[i], rInvC, sh_ewald);
-                                fScalC[i] = ewaldScalarForce(qq[i], rInvC);
+                                vCoul[i] = ewaldPotential(qq[i], rInvC, sh_ewald);
+                                if constexpr (computeForces)
+                                {
+                                    fScalC[i] = ewaldScalarForce(qq[i], rInvC);
+                                }
+
+                                if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                                {
+                                    ewaldQuadraticPotential<computeForces>(qq[i],
+                                                                           facel,
+                                                                           rC,
+                                                                           rCutoffCoul,
+                                                                           LFC[i],
+                                                                           DLF[i],
+                                                                           gapsysScaleLinpointCoulEff,
+                                                                           sh_ewald,
+                                                                           &fScalC[i],
+                                                                           &vCoul[i],
+                                                                           &dvdlCoul,
+                                                                           computeElecInteraction);
+                                }
                             }
                             else
                             {
-                                vCoul[i]  = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
-                                fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
+                                vCoul[i] = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
+                                if constexpr (computeForces)
+                                {
+                                    fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
+                                }
+
+                                if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                                {
+                                    reactionFieldQuadraticPotential<computeForces>(
+                                            qq[i],
+                                            facel,
+                                            rC,
+                                            rCutoffCoul,
+                                            LFC[i],
+                                            DLF[i],
+                                            gapsysScaleLinpointCoulEff,
+                                            krf,
+                                            crf,
+                                            &fScalC[i],
+                                            &vCoul[i],
+                                            &dvdlCoul,
+                                            computeElecInteraction);
+                                }
                             }
 
-                            vCoul[i]  = gmx::selectByMask(vCoul[i], computeElecInteraction);
-                            fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
+                            vCoul[i] = gmx::selectByMask(vCoul[i], computeElecInteraction);
+                            if constexpr (computeForces)
+                            {
+                                fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
+                            }
                         }
 
                         /* Only process the VDW interactions if we either
@@ -724,17 +864,17 @@ static void nb_free_energy_kernel(const t_nblist&
                         if constexpr (vdwInteractionTypeIsEwald)
                         {
                             computeVdwInteraction =
-                                    (r < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
+                                    (r < rVdw && (c6[i] != zero || c12[i] != zero) && bPairIncluded);
                         }
                         else
                         {
                             computeVdwInteraction =
-                                    (rV < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
+                                    (rV < rVdw && (c6[i] != zero || c12[i] != zero) && bPairIncluded);
                         }
                         if (gmx::anyTrue(computeVdwInteraction))
                         {
                             RealType rInv6;
-                            if constexpr (useSoftCore)
+                            if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                             {
                                 rInv6 = rPInvV;
                             }
@@ -742,12 +882,39 @@ static void nb_free_energy_kernel(const t_nblist&
                             {
                                 rInv6 = calculateRinv6(rInvV);
                             }
+                            // Avoid overflow at short distance for masked exclusions and
+                            // for foreign energy calculations at a hard core end state.
+                            // Note that we should limit r^-6, and thus also r^-12, and
+                            // not only r^-12, as that could lead to erroneously low instead
+                            // of very high foreign energies.
+                            rInv6           = gmx::min(rInv6, maxRInvSix);
                             RealType vVdw6  = calculateVdw6(c6[i], rInv6);
                             RealType vVdw12 = calculateVdw12(c12[i], rInv6);
 
                             vVdw[i] = lennardJonesPotential(
                                     vVdw6, vVdw12, c6[i], c12[i], repulsionShift, dispersionShift, oneSixth, oneTwelfth);
-                            fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
+                            if constexpr (computeForces)
+                            {
+                                fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
+                            }
+
+                            if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                            {
+                                lennardJonesQuadraticPotential<computeForces>(c6[i],
+                                                                              c12[i],
+                                                                              r,
+                                                                              rSq,
+                                                                              LFV[i],
+                                                                              DLF[i],
+                                                                              gapsysSigma6VdWEff[i],
+                                                                              gapsysScaleLinpointVdWEff,
+                                                                              repulsionShift,
+                                                                              dispersionShift,
+                                                                              &fScalV[i],
+                                                                              &vVdw[i],
+                                                                              &dvdlVdw,
+                                                                              computeVdwInteraction);
+                            }
 
                             if constexpr (vdwInteractionTypeIsEwald)
                             {
@@ -767,24 +934,33 @@ static void nb_free_energy_kernel(const t_nblist&
                                 const RealType d2      = d * d;
                                 const RealType sw =
                                         one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
-                                const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
 
-                                fScalV[i] = potSwitchScalarForceMod(
-                                        fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
+                                if constexpr (computeForces)
+                                {
+                                    const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
+                                    fScalV[i]          = potSwitchScalarForceMod(
+                                            fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
+                                }
                                 vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, potSwitchMask);
                             }
 
-                            vVdw[i]   = gmx::selectByMask(vVdw[i], computeVdwInteraction);
-                            fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
+                            vVdw[i] = gmx::selectByMask(vVdw[i], computeVdwInteraction);
+                            if constexpr (computeForces)
+                            {
+                                fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
+                            }
                         }
 
-                        /* fScalC (and fScalV) now contain: dV/drC * rC
-                         * Now we multiply by rC^-p, so it will be: dV/drC * rC^1-p
-                         * Further down we first multiply by r^p-2 and then by
-                         * the vector r, which in total gives: dV/drC * (r/rC)^1-p
-                         */
-                        fScalC[i] = fScalC[i] * rPInvC;
-                        fScalV[i] = fScalV[i] * rPInvV;
+                        if constexpr (computeForces)
+                        {
+                            /* fScalC (and fScalV) now contain: dV/drC * rC
+                             * Now we multiply by rC^-6, so it will be: dV/drC * rC^-5
+                             * Further down we first multiply by r^4 and then by
+                             * the vector r, which in total gives: dV/drC * (r/rC)^-5
+                             */
+                            fScalC[i] = fScalC[i] * rPInvC;
+                            fScalV[i] = fScalV[i] * rPInvV;
+                        }
                     } // end of block requiring nonZeroState
                 }     // end for (int i = 0; i < NSTATES; i++)
 
@@ -797,10 +973,13 @@ static void nb_free_energy_kernel(const t_nblist&
                         vCTot = vCTot + LFC[i] * vCoul[i];
                         vVTot = vVTot + LFV[i] * vVdw[i];
 
-                        fScal = fScal + LFC[i] * fScalC[i] * rpm2;
-                        fScal = fScal + LFV[i] * fScalV[i] * rpm2;
+                        if constexpr (computeForces)
+                        {
+                            fScal = fScal + LFC[i] * fScalC[i] * rpm2;
+                            fScal = fScal + LFV[i] * fScalV[i] * rpm2;
+                        }
 
-                        if constexpr (useSoftCore)
+                        if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                         {
                             dvdlCoul = dvdlCoul + vCoul[i] * DLF[i]
                                        + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i];
@@ -857,8 +1036,11 @@ static void nb_free_energy_kernel(const t_nblist&
                  */
                 RealType v_lr, f_lr;
 
-                pmeCoulombCorrectionVF(rSq, ewaldBeta, &v_lr, &f_lr, rSqValid);
-                f_lr = f_lr * rInv * rInv;
+                pmeCoulombCorrectionVF<computeForces>(rSq, ewaldBeta, &v_lr, &f_lr);
+                if constexpr (computeForces)
+                {
+                    f_lr = f_lr * rInv * rInv;
+                }
 
                 /* Note that any possible Ewald shift has already been applied in
                  * the normal interaction part above.
@@ -874,7 +1056,10 @@ static void nb_free_energy_kernel(const t_nblist&
                 for (int i = 0; i < NSTATES; i++)
                 {
                     vCTot = vCTot - gmx::selectByMask(LFC[i] * qq[i] * v_lr, computeElecEwaldInteraction);
-                    fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
+                    if constexpr (computeForces)
+                    {
+                        fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
+                    }
                     dvdlCoul = dvdlCoul
                                - gmx::selectByMask(DLF[i] * qq[i] * v_lr, computeElecEwaldInteraction);
                 }
@@ -893,19 +1078,22 @@ static void nb_free_energy_kernel(const t_nblist&
                  */
 
                 RealType v_lr, f_lr;
-                pmeLJCorrectionVF(
+                pmeLJCorrectionVF<computeForces>(
                         rInv, rSq, ewaldLJCoeffSq, ewaldLJCoeffSixDivSix, &v_lr, &f_lr, computeVdwEwaldInteraction, bIiEqJnr);
                 v_lr = v_lr * oneSixth;
 
                 for (int i = 0; i < NSTATES; i++)
                 {
                     vVTot = vVTot + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
-                    fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
+                    if constexpr (computeForces)
+                    {
+                        fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
+                    }
                     dvdlVdw = dvdlVdw + gmx::selectByMask(DLF[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
                 }
             }
 
-            if (doForces && gmx::anyTrue(fScal != zero))
+            if (computeForces && gmx::anyTrue(fScal != zero))
             {
                 const RealType tX = fScal * dX;
                 const RealType tY = fScal * dY;
@@ -914,22 +1102,21 @@ static void nb_free_energy_kernel(const t_nblist&
                 fIY               = fIY + tY;
                 fIZ               = fIZ + tZ;
 
-                gmx::transposeScatterDecrU<3>(
-                        reinterpret_cast<real*>(threadForceBuffer), preloadJnr, tX, tY, tZ);
+                gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ);
             }
         } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
 
         if (havePairsWithinCutoff)
         {
-            if (doForces)
-            {
-                gmx::transposeScatterIncrU<3>(
-                        reinterpret_cast<real*>(threadForceBuffer), preloadIi, fIX, fIY, fIZ);
-            }
-            if (doShiftForces)
+            if constexpr (computeForces)
             {
-                gmx::transposeScatterIncrU<3>(
-                        reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
+                gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ);
+
+                if (doShiftForces)
+                {
+                    gmx::transposeScatterIncrU<3>(
+                            reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
+                }
             }
             if (doPotential)
             {
@@ -985,95 +1172,115 @@ typedef void (*KernelFunction)(const t_nblist&
                                int                                              flags,
                                gmx::ArrayRef<const real>                        lambda,
                                t_nrnb* gmx_restrict                             nrnb,
-                               gmx::RVec*                                       threadForceBuffer,
+                               gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
                                rvec*               threadForceShiftBuffer,
                                gmx::ArrayRef<real> threadVc,
                                gmx::ArrayRef<real> threadVv,
                                gmx::ArrayRef<real> threadDvdl);
 
-template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
 static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
 {
     if (useSimd)
     {
 #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS
-        return (nb_free_energy_kernel<SimdDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
+        return (nb_free_energy_kernel<SimdDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
 #else
-        return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
+        return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
 #endif
     }
     else
     {
-        return (nb_free_energy_kernel<ScalarDataTypes, useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch>);
+        return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
     }
 }
 
-template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
-static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch, const bool useSimd)
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
+static KernelFunction dispatchKernelOnComputeForces(const bool computeForces, const bool useSimd)
 {
-    if (vdwModifierIsPotSwitch)
+    if (computeForces)
     {
-        return (dispatchKernelOnUseSimd<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
+        return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, true>(
                 useSimd));
     }
     else
     {
-        return (dispatchKernelOnUseSimd<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
+        return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, false>(
                 useSimd));
     }
 }
 
-template<bool useSoftCore, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
+static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch,
+                                                  const bool computeForces,
+                                                  const bool useSimd)
+{
+    if (vdwModifierIsPotSwitch)
+    {
+        return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
+                computeForces, useSimd));
+    }
+    else
+    {
+        return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
+                computeForces, useSimd));
+    }
+}
+
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
 static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
                                                           const bool vdwModifierIsPotSwitch,
+                                                          const bool computeForces,
                                                           const bool useSimd)
 {
     if (elecInteractionTypeIsEwald)
     {
-        return (dispatchKernelOnVdwModifier<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
-                vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
+                vdwModifierIsPotSwitch, computeForces, useSimd));
     }
     else
     {
-        return (dispatchKernelOnVdwModifier<useSoftCore, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
-                vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
+                vdwModifierIsPotSwitch, computeForces, useSimd));
     }
 }
 
-template<bool useSoftCore, bool scLambdasOrAlphasDiffer>
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer>
 static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
                                                          const bool elecInteractionTypeIsEwald,
                                                          const bool vdwModifierIsPotSwitch,
+                                                         const bool computeForces,
                                                          const bool useSimd)
 {
     if (vdwInteractionTypeIsEwald)
     {
-        return (dispatchKernelOnElecInteractionType<useSoftCore, scLambdasOrAlphasDiffer, true>(
-                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, true>(
+                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
     }
     else
     {
-        return (dispatchKernelOnElecInteractionType<useSoftCore, scLambdasOrAlphasDiffer, false>(
-                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, false>(
+                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
     }
 }
 
-template<bool useSoftCore>
+template<KernelSoftcoreType softcoreType>
 static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
                                                                   const bool vdwInteractionTypeIsEwald,
                                                                   const bool elecInteractionTypeIsEwald,
                                                                   const bool vdwModifierIsPotSwitch,
+                                                                  const bool computeForces,
                                                                   const bool useSimd)
 {
     if (scLambdasOrAlphasDiffer)
     {
-        return (dispatchKernelOnVdwInteractionType<useSoftCore, true>(
-                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnVdwInteractionType<softcoreType, true>(
+                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
     }
     else
     {
-        return (dispatchKernelOnVdwInteractionType<useSoftCore, false>(
-                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, useSimd));
+        return (dispatchKernelOnVdwInteractionType<softcoreType, false>(
+                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
     }
 }
 
@@ -1081,24 +1288,50 @@ static KernelFunction dispatchKernel(const bool                 scLambdasOrAlpha
                                      const bool                 vdwInteractionTypeIsEwald,
                                      const bool                 elecInteractionTypeIsEwald,
                                      const bool                 vdwModifierIsPotSwitch,
+                                     const bool                 computeForces,
                                      const bool                 useSimd,
                                      const interaction_const_t& ic)
 {
-    if (ic.softCoreParameters->alphaCoulomb == 0 && ic.softCoreParameters->alphaVdw == 0)
+    const auto& scParams = *ic.softCoreParameters;
+    if (scParams.softcoreType == SoftcoreType::Beutler)
     {
-        return (dispatchKernelOnScLambdasOrAlphasDifference<false>(scLambdasOrAlphasDiffer,
-                                                                   vdwInteractionTypeIsEwald,
-                                                                   elecInteractionTypeIsEwald,
-                                                                   vdwModifierIsPotSwitch,
-                                                                   useSimd));
+        if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
+        {
+            return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
+                    scLambdasOrAlphasDiffer,
+                    vdwInteractionTypeIsEwald,
+                    elecInteractionTypeIsEwald,
+                    vdwModifierIsPotSwitch,
+                    computeForces,
+                    useSimd));
+        }
+        return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Beutler>(
+                scLambdasOrAlphasDiffer,
+                vdwInteractionTypeIsEwald,
+                elecInteractionTypeIsEwald,
+                vdwModifierIsPotSwitch,
+                computeForces,
+                useSimd));
     }
-    else
+    else // Gapsys
     {
-        return (dispatchKernelOnScLambdasOrAlphasDifference<true>(scLambdasOrAlphasDiffer,
-                                                                  vdwInteractionTypeIsEwald,
-                                                                  elecInteractionTypeIsEwald,
-                                                                  vdwModifierIsPotSwitch,
-                                                                  useSimd));
+        if (scParams.gapsysScaleLinpointCoul == 0 && scParams.gapsysScaleLinpointVdW == 0)
+        {
+            return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
+                    scLambdasOrAlphasDiffer,
+                    vdwInteractionTypeIsEwald,
+                    elecInteractionTypeIsEwald,
+                    vdwModifierIsPotSwitch,
+                    computeForces,
+                    useSimd));
+        }
+        return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Gapsys>(
+                scLambdasOrAlphasDiffer,
+                vdwInteractionTypeIsEwald,
+                elecInteractionTypeIsEwald,
+                vdwModifierIsPotSwitch,
+                computeForces,
+                useSimd));
     }
 }
 
@@ -1119,7 +1352,7 @@ void gmx_nb_free_energy_kernel(const t_nblist&
                                int                                              flags,
                                gmx::ArrayRef<const real>                        lambda,
                                t_nrnb*                                          nrnb,
-                               gmx::RVec*                                       threadForceBuffer,
+                               gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
                                rvec*               threadForceShiftBuffer,
                                gmx::ArrayRef<real> threadVc,
                                gmx::ArrayRef<real> threadVv,
@@ -1129,10 +1362,16 @@ void gmx_nb_free_energy_kernel(const t_nblist&
                "Unsupported eeltype with free energy");
     GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters");
 
+    // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert
+    GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty()
+                       || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(),
+               "We need actual padding with at least one element for SIMD scatter operations");
+
     const auto& scParams                   = *ic.softCoreParameters;
     const bool  vdwInteractionTypeIsEwald  = (EVDW_PME(ic.vdwtype));
     const bool  elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype));
     const bool  vdwModifierIsPotSwitch     = (ic.vdw_modifier == InteractionModifiers::PotSwitch);
+    const bool  computeForces              = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
     bool        scLambdasOrAlphasDiffer    = true;
 
     if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
@@ -1154,6 +1393,7 @@ void gmx_nb_free_energy_kernel(const t_nblist&
                                 vdwInteractionTypeIsEwald,
                                 elecInteractionTypeIsEwald,
                                 vdwModifierIsPotSwitch,
+                                computeForces,
                                 useSimd,
                                 ic);
     kernelFunc(nlist,