Minor code clean-up in the NB FEP kernel.
[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_free_energy.cpp
index a3c495c885ce4d993188cc24f0f14e5704693840..214e41a9d93de0b01c29b8a0db279bbb4b6e5e79 100644 (file)
@@ -3,7 +3,8 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 
 #include "nb_free_energy.h"
 
+#include "config.h"
+
 #include <cmath>
+#include <set>
 
 #include <algorithm>
 
 #include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
 #include "gromacs/gmxlib/nonbonded/nonbonded.h"
+#include "gromacs/math/arrayrefwithpadding.h"
 #include "gromacs/math/functions.h"
 #include "gromacs/math/vec.h"
+#include "gromacs/mdtypes/forceoutput.h"
 #include "gromacs/mdtypes/forcerec.h"
+#include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/mdtypes/nblist.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
 #include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/arrayref.h"
+
+#include "nb_softcore.h"
 
-void
-gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
-                          rvec * gmx_restrict              xx,
-                          rvec * gmx_restrict              ff,
-                          t_forcerec * gmx_restrict        fr,
-                          const t_mdatoms * gmx_restrict   mdatoms,
-                          nb_kernel_data_t * gmx_restrict  kernel_data,
-                          t_nrnb * gmx_restrict            nrnb)
+//! Scalar (non-SIMD) data types.
+struct ScalarDataTypes
+{
+    using RealType = real; //!< The data type to use as real.
+    using IntType  = int;  //!< The data type to use as int.
+    using BoolType = bool; //!< The data type to use as bool for real value comparison.
+    static constexpr int simdRealWidth = 1; //!< The width of the RealType.
+    static constexpr int simdIntWidth  = 1; //!< The width of the IntType.
+};
+
+#if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS
+//! SIMD data types.
+struct SimdDataTypes
 {
+    using RealType = gmx::SimdReal;  //!< The data type to use as real.
+    using IntType  = gmx::SimdInt32; //!< The data type to use as int.
+    using BoolType = gmx::SimdBool;  //!< The data type to use as bool for real value comparison.
+    static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH; //!< The width of the RealType.
+#    if GMX_SIMD_HAVE_DOUBLE && GMX_DOUBLE
+    static constexpr int simdIntWidth = GMX_SIMD_DINT32_WIDTH; //!< The width of the IntType.
+#    else
+    static constexpr int simdIntWidth = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType.
+#    endif
+};
+#endif
+
+/*! \brief Lower limit for square interaction distances in nonbonded kernels.
+ *
+ * This is a mimimum on r^2 to avoid overflows when computing r^6.
+ * This will only affect results for soft-cored interaction at distances smaller
+ * than 1e-6 and will limit extremely high foreign energies for overlapping atoms.
+ * Note that we could use a somewhat smaller minimum in double precision.
+ * But because invsqrt in double precision can use single precision, this number
+ * can not be much smaller, we use the same number for simplicity.
+ */
+constexpr real c_minDistanceSquared = 1.0e-12_real;
 
-#define  STATE_A  0
-#define  STATE_B  1
-#define  NSTATES  2
-    int           i, n, ii, is3, ii3, k, nj0, nj1, jnr, j3, ggid;
-    real          shX, shY, shZ;
-    real          tx, ty, tz, Fscal;
-    double        FscalC[NSTATES], FscalV[NSTATES];  /* Needs double for sc_power==48 */
-    double        Vcoul[NSTATES], Vvdw[NSTATES];     /* Needs double for sc_power==48 */
-    real          rinv6, r, rtC, rtV;
-    real          iqA, iqB;
-    real          qq[NSTATES], vctot;
-    int           ntiA, ntiB, tj[NSTATES];
-    real          Vvdw6, Vvdw12, vvtot;
-    real          ix, iy, iz, fix, fiy, fiz;
-    real          dx, dy, dz, rsq, rinv;
-    real          c6[NSTATES], c12[NSTATES], c6grid;
-    real          LFC[NSTATES], LFV[NSTATES], DLF[NSTATES];
-    double        dvdl_coul, dvdl_vdw;
-    real          lfac_coul[NSTATES], dlfac_coul[NSTATES], lfac_vdw[NSTATES], dlfac_vdw[NSTATES];
-    real          sigma6[NSTATES], alpha_vdw_eff, alpha_coul_eff, sigma2_def, sigma2_min;
-    double        rp, rpm2, rC, rV, rinvC, rpinvC, rinvV, rpinvV; /* Needs double for sc_power==48 */
-    real          sigma2[NSTATES], sigma_pow[NSTATES];
-    int           do_tab, tab_elemsize = 0;
-    int           n0, n1C, n1V, nnn;
-    real          Y, F, Fp, Geps, Heps2, epsC, eps2C, epsV, eps2V, VV, FF;
-    int           icoul, ivdw;
-    int           nri;
-    const int *   iinr;
-    const int *   jindex;
-    const int *   jjnr;
-    const int *   shift;
-    const int *   gid;
-    const int *   typeA;
-    const int *   typeB;
-    int           ntype;
-    const real *  shiftvec;
-    real *        fshift;
-    real          tabscale = 0;
-    const real *  VFtab    = nullptr;
-    const real *  x;
-    real *        f;
-    real          facel, krf, crf;
-    const real *  chargeA;
-    const real *  chargeB;
-    real          sigma6_min, sigma6_def, lam_power, sc_r_power;
-    real          alpha_coul, alpha_vdw, lambda_coul, lambda_vdw;
-    real          ewcljrsq, ewclj, ewclj2, exponent, poly, vvdw_disp, vvdw_rep, sh_lj_ewald;
-    real          ewclj6;
-    const real *  nbfp, *nbfp_grid;
-    real *        dvdl;
-    real *        Vv;
-    real *        Vc;
-    gmx_bool      bDoForces, bDoShiftForces, bDoPotential;
-    real          rcoulomb, rvdw, sh_invrc6;
-    gmx_bool      bExactElecCutoff, bExactVdwCutoff, bExactCutoffAll;
-    gmx_bool      bEwald, bEwaldLJ;
-    real          rcutoff_max2;
-    const real *  tab_ewald_F_lj = nullptr;
-    const real *  tab_ewald_V_lj = nullptr;
-    real          d, d2, sw, dsw, rinvcorr;
-    real          elec_swV3, elec_swV4, elec_swV5, elec_swF2, elec_swF3, elec_swF4;
-    real          vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
-    gmx_bool      bConvertEwaldToCoulomb, bConvertLJEwaldToLJ6;
-    gmx_bool      bComputeVdwInteraction, bComputeElecInteraction;
-    const real *  ewtab = nullptr;
-    int           ewitab;
-    real          ewrt, eweps, ewtabscale = 0, ewtabhalfspace = 0, sh_ewald = 0;
-
-    const real    onetwelfth  = 1.0/12.0;
-    const real    onesixth    = 1.0/6.0;
-    const real    zero        = 0.0;
-    const real    half        = 0.5;
-    const real    one         = 1.0;
-    const real    two         = 2.0;
-    const real    six         = 6.0;
-    const real    fourtyeight = 48.0;
-
-    /* Extract pointer to non-bonded interaction constants */
-    const interaction_const_t *ic = fr->ic;
-
-    x                   = xx[0];
-    f                   = ff[0];
-
-    fshift              = fr->fshift[0];
-
-    nri                 = nlist->nri;
-    iinr                = nlist->iinr;
-    jindex              = nlist->jindex;
-    jjnr                = nlist->jjnr;
-    icoul               = nlist->ielec;
-    ivdw                = nlist->ivdw;
-    shift               = nlist->shift;
-    gid                 = nlist->gid;
-
-    shiftvec            = fr->shift_vec[0];
-    chargeA             = mdatoms->chargeA;
-    chargeB             = mdatoms->chargeB;
-    facel               = fr->ic->epsfac;
-    krf                 = ic->k_rf;
-    crf                 = ic->c_rf;
-    Vc                  = kernel_data->energygrp_elec;
-    typeA               = mdatoms->typeA;
-    typeB               = mdatoms->typeB;
-    ntype               = fr->ntype;
-    nbfp                = fr->nbfp;
-    nbfp_grid           = fr->ljpme_c6grid;
-    Vv                  = kernel_data->energygrp_vdw;
-    lambda_coul         = kernel_data->lambda[efptCOUL];
-    lambda_vdw          = kernel_data->lambda[efptVDW];
-    dvdl                = kernel_data->dvdl;
-    alpha_coul          = fr->sc_alphacoul;
-    alpha_vdw           = fr->sc_alphavdw;
-    lam_power           = fr->sc_power;
-    sc_r_power          = fr->sc_r_power;
-    sigma6_def          = fr->sc_sigma6_def;
-    sigma6_min          = fr->sc_sigma6_min;
-    bDoForces           = kernel_data->flags & GMX_NONBONDED_DO_FORCE;
-    bDoShiftForces      = kernel_data->flags & GMX_NONBONDED_DO_SHIFTFORCE;
-    bDoPotential        = kernel_data->flags & GMX_NONBONDED_DO_POTENTIAL;
-
-    rcoulomb            = ic->rcoulomb;
-    rvdw                = ic->rvdw;
-    sh_invrc6           = ic->sh_invrc6;
-    sh_lj_ewald         = ic->sh_lj_ewald;
-    ewclj               = ic->ewaldcoeff_lj;
-    ewclj2              = ewclj*ewclj;
-    ewclj6              = ewclj2*ewclj2*ewclj2;
-
-    if (ic->coulomb_modifier == eintmodPOTSWITCH)
+/*! \brief Higher limit for r^-6 used for Lennard-Jones interactions
+ *
+ * This is needed to avoid overflow of LJ energy and force terms for excluded
+ * atoms and foreign energies of hard-core states of overlapping atoms.
+ * Note that in single precision this value leaves room for C12 coefficients up to 3.4e8.
+ */
+constexpr real c_maxRInvSix = 1.0e15_real;
+
+template<bool computeForces, class RealType>
+static inline void
+pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType gmx_unused* force)
+{
+    const RealType brsq = rSq * beta * beta;
+    if constexpr (computeForces)
     {
-        d               = ic->rcoulomb - ic->rcoulomb_switch;
-        elec_swV3       = -10.0/(d*d*d);
-        elec_swV4       =  15.0/(d*d*d*d);
-        elec_swV5       =  -6.0/(d*d*d*d*d);
-        elec_swF2       = -30.0/(d*d*d);
-        elec_swF3       =  60.0/(d*d*d*d);
-        elec_swF4       = -30.0/(d*d*d*d*d);
+        *force = -brsq * beta * gmx::pmeForceCorrection(brsq);
     }
-    else
+    *pot = beta * gmx::pmePotentialCorrection(brsq);
+}
+
+template<bool computeForces, class RealType, class BoolType>
+static inline void pmeLJCorrectionVF(const RealType rInv,
+                                     const RealType rSq,
+                                     const real     ewaldLJCoeffSq,
+                                     const real     ewaldLJCoeffSixDivSix,
+                                     RealType*      pot,
+                                     RealType gmx_unused* force,
+                                     const BoolType       mask,
+                                     const BoolType       bIiEqJnr)
+{
+    // We mask rInv to get zero force and potential for masked out pair interactions
+    const RealType rInvSq  = rInv * rInv;
+    const RealType rInvSix = rInvSq * rInvSq * rInvSq;
+    // Mask rSq to avoid underflow in exp()
+    const RealType coeffSqRSq       = ewaldLJCoeffSq * gmx::selectByMask(rSq, mask);
+    const RealType expNegCoeffSqRSq = gmx::exp(-coeffSqRSq);
+    const RealType poly             = 1.0_real + coeffSqRSq + 0.5_real * coeffSqRSq * coeffSqRSq;
+    if constexpr (computeForces)
     {
-        /* Avoid warnings from stupid compilers (looking at you, Clang!) */
-        elec_swV3 = elec_swV4 = elec_swV5 = elec_swF2 = elec_swF3 = elec_swF4 = 0.0;
+        *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix);
+        *force = *force * rInvSq;
     }
+    // The self interaction is the limit for r -> 0 which we need to compute separately
+    *pot = gmx::blend(
+            rInvSix * (1.0_real - expNegCoeffSqRSq * poly), 0.5_real * ewaldLJCoeffSixDivSix, bIiEqJnr);
+}
+
+//! Computes r^(1/6) and 1/r^(1/6)
+template<class RealType>
+static inline void sixthRoot(const RealType r, RealType* sixthRoot, RealType* invSixthRoot)
+{
+    RealType cbrtRes = gmx::cbrt(r);
+    *invSixthRoot    = gmx::invsqrt(cbrtRes);
+    *sixthRoot       = gmx::inv(*invSixthRoot);
+}
+
+template<class RealType>
+static inline RealType calculateRinv6(const RealType rInvV)
+{
+    RealType rInv6 = rInvV * rInvV;
+    return (rInv6 * rInv6 * rInv6);
+}
 
-    if (ic->vdw_modifier == eintmodPOTSWITCH)
+template<class RealType>
+static inline RealType calculateVdw6(const RealType c6, const RealType rInv6)
+{
+    return (c6 * rInv6);
+}
+
+template<class RealType>
+static inline RealType calculateVdw12(const RealType c12, const RealType rInv6)
+{
+    return (c12 * rInv6 * rInv6);
+}
+
+/* reaction-field electrostatics */
+template<class RealType>
+static inline RealType reactionFieldScalarForce(const RealType qq,
+                                                const RealType rInv,
+                                                const RealType r,
+                                                const real     krf,
+                                                const real     two)
+{
+    return (qq * (rInv - two * krf * r * r));
+}
+template<class RealType>
+static inline RealType reactionFieldPotential(const RealType qq,
+                                              const RealType rInv,
+                                              const RealType r,
+                                              const real     krf,
+                                              const real     potentialShift)
+{
+    return (qq * (rInv + krf * r * r - potentialShift));
+}
+
+/* Ewald electrostatics */
+template<class RealType>
+static inline RealType ewaldScalarForce(const RealType coulomb, const RealType rInv)
+{
+    return (coulomb * rInv);
+}
+template<class RealType>
+static inline RealType ewaldPotential(const RealType coulomb, const RealType rInv, const real potentialShift)
+{
+    return (coulomb * (rInv - potentialShift));
+}
+
+/* cutoff LJ */
+template<class RealType>
+static inline RealType lennardJonesScalarForce(const RealType v6, const RealType v12)
+{
+    return (v12 - v6);
+}
+template<class RealType>
+static inline RealType lennardJonesPotential(const RealType v6,
+                                             const RealType v12,
+                                             const RealType c6,
+                                             const RealType c12,
+                                             const real     repulsionShift,
+                                             const real     dispersionShift,
+                                             const real     oneSixth,
+                                             const real     oneTwelfth)
+{
+    return ((v12 + c12 * repulsionShift) * oneTwelfth - (v6 + c6 * dispersionShift) * oneSixth);
+}
+
+/* Ewald LJ */
+template<class RealType>
+static inline RealType ewaldLennardJonesGridSubtract(const RealType c6grid,
+                                                     const real     potentialShift,
+                                                     const real     oneSixth)
+{
+    return (c6grid * potentialShift * oneSixth);
+}
+
+/* LJ Potential switch */
+template<class RealType, class BoolType>
+static inline RealType potSwitchScalarForceMod(const RealType fScalarInp,
+                                               const RealType potential,
+                                               const RealType sw,
+                                               const RealType r,
+                                               const RealType dsw,
+                                               const BoolType mask)
+{
+    /* The mask should select on rV < rVdw */
+    return (gmx::selectByMask(fScalarInp * sw - r * potential * dsw, mask));
+}
+template<class RealType, class BoolType>
+static inline RealType potSwitchPotentialMod(const RealType potentialInp, const RealType sw, const BoolType mask)
+{
+    /* The mask should select on rV < rVdw */
+    return (gmx::selectByMask(potentialInp * sw, mask));
+}
+
+
+//! Templated free-energy non-bonded kernel
+template<typename DataTypes, KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
+static void nb_free_energy_kernel(const t_nblist&                                  nlist,
+                                  const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
+                                  const int                                        ntype,
+                                  const real                                       rlist,
+                                  const interaction_const_t&                       ic,
+                                  gmx::ArrayRef<const gmx::RVec>                   shiftvec,
+                                  gmx::ArrayRef<const real>                        nbfp,
+                                  gmx::ArrayRef<const real> gmx_unused             nbfp_grid,
+                                  gmx::ArrayRef<const real>                        chargeA,
+                                  gmx::ArrayRef<const real>                        chargeB,
+                                  gmx::ArrayRef<const int>                         typeA,
+                                  gmx::ArrayRef<const int>                         typeB,
+                                  int                                              flags,
+                                  gmx::ArrayRef<const real>                        lambda,
+                                  t_nrnb* gmx_restrict                             nrnb,
+                                  gmx::ArrayRefWithPadding<gmx::RVec> threadForceBuffer,
+                                  rvec gmx_unused*    threadForceShiftBuffer,
+                                  gmx::ArrayRef<real> threadVc,
+                                  gmx::ArrayRef<real> threadVv,
+                                  gmx::ArrayRef<real> threadDvdl)
+{
+#define STATE_A 0
+#define STATE_B 1
+#define NSTATES 2
+
+    using RealType = typename DataTypes::RealType;
+    using IntType  = typename DataTypes::IntType;
+    using BoolType = typename DataTypes::BoolType;
+
+    constexpr real oneTwelfth = 1.0_real / 12.0_real;
+    constexpr real oneSixth   = 1.0_real / 6.0_real;
+    constexpr real zero       = 0.0_real;
+    constexpr real half       = 0.5_real;
+    constexpr real one        = 1.0_real;
+    constexpr real two        = 2.0_real;
+    constexpr real six        = 6.0_real;
+
+    // Extract pair list data
+    const int                nri    = nlist.nri;
+    gmx::ArrayRef<const int> iinr   = nlist.iinr;
+    gmx::ArrayRef<const int> jindex = nlist.jindex;
+    gmx::ArrayRef<const int> jjnr   = nlist.jjnr;
+    gmx::ArrayRef<const int> shift  = nlist.shift;
+    gmx::ArrayRef<const int> gid    = nlist.gid;
+
+    const real lambda_coul = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)];
+    const real lambda_vdw  = lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)];
+
+    // Extract softcore parameters
+    const auto&           scParams   = *ic.softCoreParameters;
+    const real            lam_power  = scParams.lambdaPower;
+    const real gmx_unused alpha_coul = scParams.alphaCoulomb;
+    const real gmx_unused alpha_vdw  = scParams.alphaVdw;
+    const real gmx_unused sigma6_def = scParams.sigma6WithInvalidSigma;
+    const real gmx_unused sigma6_min = scParams.sigma6Minimum;
+
+    const real gmx_unused gapsysScaleLinpointCoul = scParams.gapsysScaleLinpointCoul;
+    const real gmx_unused gapsysScaleLinpointVdW  = scParams.gapsysScaleLinpointVdW;
+    const real gmx_unused gapsysSigma6VdW         = scParams.gapsysSigma6VdW;
+
+    const bool gmx_unused doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0);
+    const bool            doPotential   = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0);
+
+    // Extract data from interaction_const_t
+    const real            facel           = ic.epsfac;
+    const real            rCoulomb        = ic.rcoulomb;
+    const real            krf             = ic.reactionFieldCoefficient;
+    const real            crf             = ic.reactionFieldShift;
+    const real gmx_unused shLjEwald       = ic.sh_lj_ewald;
+    const real            rVdw            = ic.rvdw;
+    const real            dispersionShift = ic.dispersion_shift.cpot;
+    const real            repulsionShift  = ic.repulsion_shift.cpot;
+    const real            ewaldBeta       = ic.ewaldcoeff_q;
+    real gmx_unused       ewaldLJCoeffSq;
+    real gmx_unused       ewaldLJCoeffSixDivSix;
+    if constexpr (vdwInteractionTypeIsEwald)
+    {
+        ewaldLJCoeffSq        = ic.ewaldcoeff_lj * ic.ewaldcoeff_lj;
+        ewaldLJCoeffSixDivSix = ewaldLJCoeffSq * ewaldLJCoeffSq * ewaldLJCoeffSq / six;
+    }
+
+    // Note that the nbnxm kernels do not support Coulomb potential switching at all
+    GMX_ASSERT(ic.coulomb_modifier != InteractionModifiers::PotSwitch,
+               "Potential switching is not supported for Coulomb with FEP");
+
+    const real      rVdwSwitch = ic.rvdw_switch;
+    real gmx_unused vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
+    if constexpr (vdwModifierIsPotSwitch)
     {
-        d               = ic->rvdw - ic->rvdw_switch;
-        vdw_swV3        = -10.0/(d*d*d);
-        vdw_swV4        =  15.0/(d*d*d*d);
-        vdw_swV5        =  -6.0/(d*d*d*d*d);
-        vdw_swF2        = -30.0/(d*d*d);
-        vdw_swF3        =  60.0/(d*d*d*d);
-        vdw_swF4        = -30.0/(d*d*d*d*d);
+        const real d = rVdw - rVdwSwitch;
+        vdw_swV3     = -10.0_real / (d * d * d);
+        vdw_swV4     = 15.0_real / (d * d * d * d);
+        vdw_swV5     = -6.0_real / (d * d * d * d * d);
+        vdw_swF2     = -30.0_real / (d * d * d);
+        vdw_swF3     = 60.0_real / (d * d * d * d);
+        vdw_swF4     = -30.0_real / (d * d * d * d * d);
     }
     else
     {
         /* Avoid warnings from stupid compilers (looking at you, Clang!) */
-        vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = 0.0;
+        vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = zero;
     }
 
-    if (fr->cutoff_scheme == ecutsVERLET)
+    NbkernelElecType icoul;
+    if (ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype))
     {
-        const interaction_const_t *ic = fr->ic;
-
-        if (EVDW_PME(ic->vdwtype))
-        {
-            ivdw         = GMX_NBKERNEL_VDW_LJEWALD;
-        }
-        else
-        {
-            ivdw         = GMX_NBKERNEL_VDW_LENNARDJONES;
-        }
-
-        if (ic->eeltype == eelCUT || EEL_RF(ic->eeltype))
-        {
-            icoul        = GMX_NBKERNEL_ELEC_REACTIONFIELD;
-        }
-        else if (EEL_PME_EWALD(ic->eeltype))
-        {
-            icoul        = GMX_NBKERNEL_ELEC_EWALD;
-        }
-        else
-        {
-            gmx_incons("Unsupported eeltype with Verlet and free-energy");
-        }
-
-        bExactElecCutoff = TRUE;
-        bExactVdwCutoff  = TRUE;
+        icoul = NbkernelElecType::ReactionField;
     }
     else
     {
-        bExactElecCutoff = (ic->coulomb_modifier != eintmodNONE) || ic->eeltype == eelRF_ZERO;
-        bExactVdwCutoff  = (ic->vdw_modifier != eintmodNONE);
+        icoul = NbkernelElecType::None;
     }
 
-    bExactCutoffAll = (bExactElecCutoff && bExactVdwCutoff);
-    rcutoff_max2    = std::max(ic->rcoulomb, ic->rvdw);
-    rcutoff_max2    = rcutoff_max2*rcutoff_max2;
+    real rcutoff_max2                 = std::max(ic.rcoulomb, ic.rvdw);
+    rcutoff_max2                      = rcutoff_max2 * rcutoff_max2;
+    const real gmx_unused rCutoffCoul = ic.rcoulomb;
 
-    bEwald          = (icoul == GMX_NBKERNEL_ELEC_EWALD);
-    bEwaldLJ        = (ivdw == GMX_NBKERNEL_VDW_LJEWALD);
-
-    if (bEwald || bEwaldLJ)
+    real gmx_unused sh_ewald = 0;
+    if constexpr (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald)
     {
-        sh_ewald       = ic->sh_ewald;
-        ewtab          = ic->tabq_coul_FDV0;
-        ewtabscale     = ic->tabq_scale;
-        ewtabhalfspace = half/ewtabscale;
-        tab_ewald_F_lj = ic->tabq_vdw_F;
-        tab_ewald_V_lj = ic->tabq_vdw_V;
+        sh_ewald = ic.sh_ewald;
     }
 
     /* For Ewald/PME interactions we cannot easily apply the soft-core component to
-     * reciprocal space. When we use vanilla (not switch/shift) Ewald interactions, we
-     * can apply the small trick of subtracting the _reciprocal_ space contribution
-     * in this kernel, and instead apply the free energy interaction to the 1/r
-     * (standard coulomb) interaction.
+     * reciprocal space. When we use non-switched Ewald interactions, we
+     * assume the soft-coring does not significantly affect the grid contribution
+     * and apply the soft-core only to the full 1/r (- shift) pair contribution.
      *
      * However, we cannot use this approach for switch-modified since we would then
      * effectively end up evaluating a significantly different interaction here compared to the
@@ -286,29 +394,18 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
      * things (1/r rather than short-range Ewald). For these settings, we just
      * use the traditional short-range Ewald interaction in that case.
      */
-    bConvertEwaldToCoulomb = (bEwald && (ic->coulomb_modifier != eintmodPOTSWITCH));
-    /* For now the below will always be true (since LJ-PME only works with Shift in Gromacs-5.0),
-     * but writing it this way means we stay in sync with coulomb, and it avoids future bugs.
-     */
-    bConvertLJEwaldToLJ6   = (bEwaldLJ && (ic->vdw_modifier   != eintmodPOTSWITCH));
+    GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch),
+                       "Can not apply soft-core to switched Ewald potentials");
 
-    /* We currently don't implement exclusion correction, needed with the Verlet cut-off scheme, without conversion */
-    if (fr->cutoff_scheme == ecutsVERLET &&
-        ((bEwald   && !bConvertEwaldToCoulomb) ||
-         (bEwaldLJ && !bConvertLJEwaldToLJ6)))
-    {
-        gmx_incons("Unimplemented non-bonded setup");
-    }
-
-    /* fix compiler warnings */
-    n1C   = n1V   = 0;
-    epsC  = epsV  = 0;
-    eps2C = eps2V = 0;
+    const RealType            minDistanceSquared(c_minDistanceSquared);
+    const RealType            maxRInvSix(c_maxRInvSix);
+    const RealType gmx_unused floatMin(GMX_FLOAT_MIN);
 
-    dvdl_coul  = 0;
-    dvdl_vdw   = 0;
+    RealType dvdlCoul{ zero };
+    RealType dvdlVdw{ zero };
 
     /* Lambda factor for state A, 1-lambda*/
+    real LFC[NSTATES], LFV[NSTATES];
     LFC[STATE_A] = one - lambda_coul;
     LFV[STATE_A] = one - lambda_vdw;
 
@@ -317,472 +414,617 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
     LFV[STATE_B] = lambda_vdw;
 
     /*derivative of the lambda factor for state A and B */
-    DLF[STATE_A] = -1;
-    DLF[STATE_B] = 1;
+    real DLF[NSTATES];
+    DLF[STATE_A] = -one;
+    DLF[STATE_B] = one;
 
-    for (i = 0; i < NSTATES; i++)
+    real gmx_unused lFacCoul[NSTATES], dlFacCoul[NSTATES], lFacVdw[NSTATES], dlFacVdw[NSTATES];
+    constexpr real  sc_r_power = six;
+    for (int i = 0; i < NSTATES; i++)
     {
-        lfac_coul[i]  = (lam_power == 2 ? (1-LFC[i])*(1-LFC[i]) : (1-LFC[i]));
-        dlfac_coul[i] = DLF[i]*lam_power/sc_r_power*(lam_power == 2 ? (1-LFC[i]) : 1);
-        lfac_vdw[i]   = (lam_power == 2 ? (1-LFV[i])*(1-LFV[i]) : (1-LFV[i]));
-        dlfac_vdw[i]  = DLF[i]*lam_power/sc_r_power*(lam_power == 2 ? (1-LFV[i]) : 1);
+        lFacCoul[i]  = (lam_power == 2 ? (1 - LFC[i]) * (1 - LFC[i]) : (1 - LFC[i]));
+        dlFacCoul[i] = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFC[i]) : 1);
+        lFacVdw[i]   = (lam_power == 2 ? (1 - LFV[i]) * (1 - LFV[i]) : (1 - LFV[i]));
+        dlFacVdw[i]  = DLF[i] * lam_power / sc_r_power * (lam_power == 2 ? (1 - LFV[i]) : 1);
     }
-    /* precalculate */
-    sigma2_def = std::cbrt(sigma6_def);
-    sigma2_min = std::cbrt(sigma6_min);
 
-    /* Ewald (not PME) table is special (icoul==enbcoulFEWALD) */
-
-    do_tab = (icoul == GMX_NBKERNEL_ELEC_CUBICSPLINETABLE ||
-              ivdw == GMX_NBKERNEL_VDW_CUBICSPLINETABLE);
-    if (do_tab)
+    // We need pointers to real for SIMD access
+    const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0];
+    real* gmx_restrict       forceRealPtr;
+    if constexpr (computeForces)
     {
-        tabscale         = kernel_data->table_elec_vdw->scale;
-        VFtab            = kernel_data->table_elec_vdw->data;
-        /* we always use the combined table here */
-        tab_elemsize     = kernel_data->table_elec_vdw->stride;
+        GMX_ASSERT(nri == 0 || !threadForceBuffer.empty(), "need a valid threadForceBuffer");
+
+        forceRealPtr = threadForceBuffer.paddedArrayRef().data()[0];
+
+        if (doShiftForces)
+        {
+            GMX_ASSERT(threadForceShiftBuffer != nullptr, "need a valid threadForceShiftBuffer");
+        }
     }
 
-    for (n = 0; (n < nri); n++)
+    const real rlistSquared = gmx::square(rlist);
+
+    bool haveExcludedPairsBeyondRlist = false;
+
+    for (int n = 0; n < nri; n++)
     {
-        int npair_within_cutoff;
-
-        npair_within_cutoff = 0;
-
-        is3              = 3*shift[n];
-        shX              = shiftvec[is3];
-        shY              = shiftvec[is3+1];
-        shZ              = shiftvec[is3+2];
-        nj0              = jindex[n];
-        nj1              = jindex[n+1];
-        ii               = iinr[n];
-        ii3              = 3*ii;
-        ix               = shX + x[ii3+0];
-        iy               = shY + x[ii3+1];
-        iz               = shZ + x[ii3+2];
-        iqA              = facel*chargeA[ii];
-        iqB              = facel*chargeB[ii];
-        ntiA             = 2*ntype*typeA[ii];
-        ntiB             = 2*ntype*typeB[ii];
-        vctot            = 0;
-        vvtot            = 0;
-        fix              = 0;
-        fiy              = 0;
-        fiz              = 0;
-
-        for (k = nj0; (k < nj1); k++)
+        bool havePairsWithinCutoff = false;
+
+        const int  is   = shift[n];
+        const real shX  = shiftvec[is][XX];
+        const real shY  = shiftvec[is][YY];
+        const real shZ  = shiftvec[is][ZZ];
+        const int  nj0  = jindex[n];
+        const int  nj1  = jindex[n + 1];
+        const int  ii   = iinr[n];
+        const int  ii3  = 3 * ii;
+        const real ix   = shX + x[ii3 + 0];
+        const real iy   = shY + x[ii3 + 1];
+        const real iz   = shZ + x[ii3 + 2];
+        const real iqA  = facel * chargeA[ii];
+        const real iqB  = facel * chargeB[ii];
+        const int  ntiA = ntype * typeA[ii];
+        const int  ntiB = ntype * typeB[ii];
+        RealType   vCTot(0);
+        RealType   vVTot(0);
+        RealType   fIX(0);
+        RealType   fIY(0);
+        RealType   fIZ(0);
+
+#if GMX_SIMD_HAVE_REAL
+        alignas(GMX_SIMD_ALIGNMENT) int            preloadIi[DataTypes::simdRealWidth];
+        alignas(GMX_SIMD_ALIGNMENT) int gmx_unused preloadIs[DataTypes::simdRealWidth];
+#else
+        int            preloadIi[DataTypes::simdRealWidth];
+        int gmx_unused preloadIs[DataTypes::simdRealWidth];
+#endif
+        for (int i = 0; i < DataTypes::simdRealWidth; i++)
         {
-            jnr              = jjnr[k];
-            j3               = 3*jnr;
-            dx               = ix - x[j3];
-            dy               = iy - x[j3+1];
-            dz               = iz - x[j3+2];
-            rsq              = dx*dx + dy*dy + dz*dz;
-
-            if (bExactCutoffAll && rsq >= rcutoff_max2)
+            preloadIi[i] = ii;
+            preloadIs[i] = shift[n];
+        }
+        IntType ii_s = gmx::load<IntType>(preloadIi);
+
+        for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
+        {
+            RealType r, rInv;
+
+#if GMX_SIMD_HAVE_REAL
+            alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIsValid[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT) real    preloadPairIncluded[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT) int32_t preloadJnr[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT) int32_t typeIndices[NSTATES][DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT) real    preloadQq[NSTATES][DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT)
+                    real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT)
+                    real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT)
+                    real gmx_unused preloadGapsysSigma6VdW[NSTATES][DataTypes::simdRealWidth];
+            alignas(GMX_SIMD_ALIGNMENT) real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
+#else
+            real            preloadPairIsValid[DataTypes::simdRealWidth];
+            real            preloadPairIncluded[DataTypes::simdRealWidth];
+            int             preloadJnr[DataTypes::simdRealWidth];
+            int             typeIndices[NSTATES][DataTypes::simdRealWidth];
+            real            preloadQq[NSTATES][DataTypes::simdRealWidth];
+            real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth];
+            real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth];
+            real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth];
+            real gmx_unused preloadGapsysScaleLinpointVdW[DataTypes::simdRealWidth];
+            real gmx_unused preloadGapsysScaleLinpointCoul[DataTypes::simdRealWidth];
+            real gmx_unused preloadGapsysSigma6VdW[NSTATES][DataTypes::simdRealWidth];
+            real            preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth];
+#endif
+            for (int j = 0; j < DataTypes::simdRealWidth; j++)
+            {
+                if (k + j < nj1)
+                {
+                    preloadPairIsValid[j] = true;
+                    /* Check if this pair on the exclusions list.*/
+                    preloadPairIncluded[j]  = (nlist.excl_fep.empty() || nlist.excl_fep[k + j]);
+                    const int jnr           = jjnr[k + j];
+                    preloadJnr[j]           = jnr;
+                    typeIndices[STATE_A][j] = ntiA + typeA[jnr];
+                    typeIndices[STATE_B][j] = ntiB + typeB[jnr];
+                    preloadQq[STATE_A][j]   = iqA * chargeA[jnr];
+                    preloadQq[STATE_B][j]   = iqB * chargeB[jnr];
+
+                    for (int i = 0; i < NSTATES; i++)
+                    {
+                        if constexpr (vdwInteractionTypeIsEwald)
+                        {
+                            preloadLjPmeC6Grid[i][j] = nbfp_grid[2 * typeIndices[i][j]];
+                        }
+                        else
+                        {
+                            preloadLjPmeC6Grid[i][j] = 0;
+                        }
+                        if constexpr (softcoreType == KernelSoftcoreType::Beutler)
+                        {
+                            const real c6  = nbfp[2 * typeIndices[i][j]];
+                            const real c12 = nbfp[2 * typeIndices[i][j] + 1];
+                            if (c6 > 0 && c12 > 0)
+                            {
+                                /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
+                                preloadSigma6[i][j] = 0.5_real * c12 / c6;
+                                if (preloadSigma6[i][j]
+                                    < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */
+                                {
+                                    preloadSigma6[i][j] = sigma6_min;
+                                }
+                            }
+                            else
+                            {
+                                preloadSigma6[i][j] = sigma6_def;
+                            }
+                        }
+                        if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                        {
+                            const real c6  = nbfp[2 * typeIndices[i][j]];
+                            const real c12 = nbfp[2 * typeIndices[i][j] + 1];
+                            if (c6 > 0 && c12 > 0)
+                            {
+                                /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
+                                preloadGapsysSigma6VdW[i][j] = 0.5_real * c12 / c6;
+                            }
+                            else
+                            {
+                                preloadGapsysSigma6VdW[i][j] = gapsysSigma6VdW;
+                            }
+                        }
+                    }
+                    if constexpr (softcoreType == KernelSoftcoreType::Beutler)
+                    {
+                        /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
+                        const real c12A = nbfp[2 * typeIndices[STATE_A][j] + 1];
+                        const real c12B = nbfp[2 * typeIndices[STATE_B][j] + 1];
+                        if (c12A > 0 && c12B > 0)
+                        {
+                            preloadAlphaVdwEff[j]  = 0;
+                            preloadAlphaCoulEff[j] = 0;
+                        }
+                        else
+                        {
+                            preloadAlphaVdwEff[j]  = alpha_vdw;
+                            preloadAlphaCoulEff[j] = alpha_coul;
+                        }
+                    }
+                    if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                    {
+                        /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
+                        const real c12A = nbfp[2 * typeIndices[STATE_A][j] + 1];
+                        const real c12B = nbfp[2 * typeIndices[STATE_B][j] + 1];
+                        if (c12A > 0 && c12B > 0)
+                        {
+                            preloadGapsysScaleLinpointVdW[j]  = 0;
+                            preloadGapsysScaleLinpointCoul[j] = 0;
+                        }
+                        else
+                        {
+                            preloadGapsysScaleLinpointVdW[j]  = gapsysScaleLinpointVdW;
+                            preloadGapsysScaleLinpointCoul[j] = gapsysScaleLinpointCoul;
+                        }
+                    }
+                }
+                else
+                {
+                    preloadJnr[j]                     = jjnr[k];
+                    preloadPairIsValid[j]             = false;
+                    preloadPairIncluded[j]            = false;
+                    preloadAlphaVdwEff[j]             = 0;
+                    preloadAlphaCoulEff[j]            = 0;
+                    preloadGapsysScaleLinpointVdW[j]  = 0;
+                    preloadGapsysScaleLinpointCoul[j] = 0;
+
+                    typeIndices[STATE_A][j] = ntiA + typeA[jjnr[k]];
+                    typeIndices[STATE_B][j] = ntiB + typeB[jjnr[k]];
+                    for (int i = 0; i < NSTATES; i++)
+                    {
+                        preloadLjPmeC6Grid[i][j]     = 0;
+                        preloadQq[i][j]              = 0;
+                        preloadSigma6[i][j]          = 0;
+                        preloadGapsysSigma6VdW[i][j] = 0;
+                    }
+                }
+            }
+
+            RealType jx, jy, jz;
+            gmx::gatherLoadUTranspose<3>(reinterpret_cast<const real*>(x), preloadJnr, &jx, &jy, &jz);
+
+            const RealType pairIsValid   = gmx::load<RealType>(preloadPairIsValid);
+            const RealType pairIncluded  = gmx::load<RealType>(preloadPairIncluded);
+            const BoolType bPairIncluded = (pairIncluded != zero);
+            const BoolType bPairExcluded = (pairIncluded == zero && pairIsValid != zero);
+
+            const RealType dX  = ix - jx;
+            const RealType dY  = iy - jy;
+            const RealType dZ  = iz - jz;
+            RealType       rSq = dX * dX + dY * dY + dZ * dZ;
+
+            BoolType withinCutoffMask = (rSq < rcutoff_max2);
+
+            if (!gmx::anyTrue(withinCutoffMask || bPairExcluded))
             {
                 /* We save significant time by skipping all code below.
                  * Note that with soft-core interactions, the actual cut-off
                  * check might be different. But since the soft-core distance
                  * is always larger than r, checking on r here is safe.
+                 * Exclusions outside the cutoff can not be skipped as
+                 * when using Ewald: the reciprocal-space
+                 * Ewald component still needs to be subtracted.
                  */
                 continue;
             }
-            npair_within_cutoff++;
-
-            if (rsq > 0)
+            else
             {
-                /* Note that unlike in the nbnxn kernels, we do not need
-                 * to clamp the value of rsq before taking the invsqrt
-                 * to avoid NaN in the LJ calculation, since here we do
-                 * not calculate LJ interactions when C6 and C12 are zero.
-                 */
-
-                rinv         = gmx::invsqrt(rsq);
-                r            = rsq*rinv;
+                havePairsWithinCutoff = true;
             }
-            else
+
+            if (gmx::anyTrue(rlistSquared < rSq && bPairExcluded))
             {
-                /* The force at r=0 is zero, because of symmetry.
-                 * But note that the potential is in general non-zero,
-                 * since the soft-cored r will be non-zero.
-                 */
-                rinv         = 0;
-                r            = 0;
+                haveExcludedPairsBeyondRlist = true;
             }
 
-            if (sc_r_power == six)
+            const IntType  jnr_s    = gmx::load<IntType>(preloadJnr);
+            const BoolType bIiEqJnr = gmx::cvtIB2B(ii_s == jnr_s);
+
+            RealType            c6[NSTATES];
+            RealType            c12[NSTATES];
+            RealType gmx_unused sigma6[NSTATES];
+            RealType            qq[NSTATES];
+            RealType gmx_unused ljPmeC6Grid[NSTATES];
+            RealType gmx_unused alphaVdwEff;
+            RealType gmx_unused alphaCoulEff;
+            RealType gmx_unused gapsysScaleLinpointVdWEff;
+            RealType gmx_unused gapsysScaleLinpointCoulEff;
+            RealType gmx_unused gapsysSigma6VdWEff[NSTATES];
+            for (int i = 0; i < NSTATES; i++)
             {
-                rpm2             = rsq*rsq;  /* r4 */
-                rp               = rpm2*rsq; /* r6 */
+                gmx::gatherLoadTranspose<2>(nbfp.data(), typeIndices[i], &c6[i], &c12[i]);
+                qq[i]          = gmx::load<RealType>(preloadQq[i]);
+                ljPmeC6Grid[i] = gmx::load<RealType>(preloadLjPmeC6Grid[i]);
+                if constexpr (softcoreType == KernelSoftcoreType::Beutler)
+                {
+                    sigma6[i] = gmx::load<RealType>(preloadSigma6[i]);
+                }
+                if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                {
+                    gapsysSigma6VdWEff[i] = gmx::load<RealType>(preloadGapsysSigma6VdW[i]);
+                }
             }
-            else if (sc_r_power == fourtyeight)
+            if constexpr (softcoreType == KernelSoftcoreType::Beutler)
             {
-                rp               = rsq*rsq*rsq; /* r6 */
-                rp               = rp*rp;       /* r12 */
-                rp               = rp*rp;       /* r24 */
-                rp               = rp*rp;       /* r48 */
-                rpm2             = rp/rsq;      /* r46 */
+                alphaVdwEff  = gmx::load<RealType>(preloadAlphaVdwEff);
+                alphaCoulEff = gmx::load<RealType>(preloadAlphaCoulEff);
             }
-            else
+            if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
             {
-                rp             = std::pow(r, sc_r_power);  /* not currently supported as input, but can handle it */
-                rpm2           = rp/rsq;
+                gapsysScaleLinpointVdWEff  = gmx::load<RealType>(preloadGapsysScaleLinpointVdW);
+                gapsysScaleLinpointCoulEff = gmx::load<RealType>(preloadGapsysScaleLinpointCoul);
             }
 
-            Fscal = 0;
+            // Avoid overflow of r^-12 at distances near zero
+            rSq  = gmx::max(rSq, minDistanceSquared);
+            rInv = gmx::invsqrt(rSq);
+            r    = rSq * rInv;
 
-            qq[STATE_A]      = iqA*chargeA[jnr];
-            qq[STATE_B]      = iqB*chargeB[jnr];
+            RealType gmx_unused rp, rpm2;
+            if constexpr (softcoreType == KernelSoftcoreType::Beutler)
+            {
+                rpm2 = rSq * rSq;  /* r4 */
+                rp   = rpm2 * rSq; /* r6 */
+            }
+            else
+            {
+                /* The soft-core power p will not affect the results
+                 * with not using soft-core, so we use power of 0 which gives
+                 * the simplest math and cheapest code.
+                 */
+                rpm2 = rInv * rInv;
+                rp   = one;
+            }
 
-            tj[STATE_A]      = ntiA+2*typeA[jnr];
-            tj[STATE_B]      = ntiB+2*typeB[jnr];
+            RealType fScal(0);
 
-            if (nlist->excl_fep == nullptr || nlist->excl_fep[k])
+            /* The following block is masked to only calculate values having bPairIncluded. If
+             * bPairIncluded is true then withinCutoffMask must also be true. */
+            if (gmx::anyTrue(withinCutoffMask && bPairIncluded))
             {
-                c6[STATE_A]      = nbfp[tj[STATE_A]];
-                c6[STATE_B]      = nbfp[tj[STATE_B]];
-
-                for (i = 0; i < NSTATES; i++)
+                RealType gmx_unused fScalC[NSTATES], fScalV[NSTATES];
+                RealType            vCoul[NSTATES], vVdw[NSTATES];
+                for (int i = 0; i < NSTATES; i++)
                 {
-                    c12[i]             = nbfp[tj[i]+1];
-                    if ((c6[i] > 0) && (c12[i] > 0))
+                    fScalC[i] = zero;
+                    fScalV[i] = zero;
+                    vCoul[i]  = zero;
+                    vVdw[i]   = zero;
+
+                    RealType gmx_unused rInvC, rInvV, rC, rV, rPInvC, rPInvV;
+
+                    /* The following block is masked to require (qq[i] != 0 || c6[i] != 0 || c12[i]
+                     * != 0) in addition to bPairIncluded, which in turn requires withinCutoffMask. */
+                    BoolType nonZeroState = ((qq[i] != zero || c6[i] != zero || c12[i] != zero)
+                                             && bPairIncluded && withinCutoffMask);
+                    if (gmx::anyTrue(nonZeroState))
                     {
-                        /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
-                        sigma6[i]       = half*c12[i]/c6[i];
-                        sigma2[i]       = std::cbrt(sigma6[i]);
-                        /* should be able to get rid of cbrt call eventually.  Will require agreement on
-                           what data to store externally.  Can't be fixed without larger scale changes, so not 4.6 */
-                        if (sigma6[i] < sigma6_min)   /* for disappearing coul and vdw with soft core at the same time */
+                        if constexpr (softcoreType == KernelSoftcoreType::Beutler)
                         {
-                            sigma6[i] = sigma6_min;
-                            sigma2[i] = sigma2_min;
-                        }
-                    }
-                    else
-                    {
-                        sigma6[i]       = sigma6_def;
-                        sigma2[i]       = sigma2_def;
-                    }
-                    if (sc_r_power == six)
-                    {
-                        sigma_pow[i]    = sigma6[i];
-                    }
-                    else if (sc_r_power == fourtyeight)
-                    {
-                        sigma_pow[i]    = sigma6[i]*sigma6[i];       /* sigma^12 */
-                        sigma_pow[i]    = sigma_pow[i]*sigma_pow[i]; /* sigma^24 */
-                        sigma_pow[i]    = sigma_pow[i]*sigma_pow[i]; /* sigma^48 */
-                    }
-                    else
-                    {    /* not really supported as input, but in here for testing the general case*/
-                        sigma_pow[i]    = std::pow(sigma2[i], sc_r_power/2);
-                    }
-                }
+                            RealType divisor = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp);
+                            rPInvC           = gmx::inv(divisor);
+                            sixthRoot(rPInvC, &rInvC, &rC);
 
-                /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/
-                if ((c12[STATE_A] > 0) && (c12[STATE_B] > 0))
-                {
-                    alpha_vdw_eff    = 0;
-                    alpha_coul_eff   = 0;
-                }
-                else
-                {
-                    alpha_vdw_eff    = alpha_vdw;
-                    alpha_coul_eff   = alpha_coul;
-                }
-
-                for (i = 0; i < NSTATES; i++)
-                {
-                    FscalC[i]    = 0;
-                    FscalV[i]    = 0;
-                    Vcoul[i]     = 0;
-                    Vvdw[i]      = 0;
-
-                    /* Only spend time on A or B state if it is non-zero */
-                    if ( (qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0) )
-                    {
-                        /* this section has to be inside the loop because of the dependence on sigma_pow */
-                        rpinvC         = one/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp);
-                        rinvC          = std::pow(rpinvC, one/sc_r_power);
-                        rC             = one/rinvC;
-
-                        rpinvV         = one/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp);
-                        rinvV          = std::pow(rpinvV, one/sc_r_power);
-                        rV             = one/rinvV;
-
-                        if (do_tab)
+                            if constexpr (scLambdasOrAlphasDiffer)
+                            {
+                                RealType divisor = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp);
+                                rPInvV           = gmx::inv(divisor);
+                                sixthRoot(rPInvV, &rInvV, &rV);
+                            }
+                            else
+                            {
+                                /* We can avoid one expensive pow and one / operation */
+                                rPInvV = rPInvC;
+                                rInvV  = rInvC;
+                                rV     = rC;
+                            }
+                        }
+                        else
                         {
-                            rtC        = rC*tabscale;
-                            n0         = rtC;
-                            epsC       = rtC-n0;
-                            eps2C      = epsC*epsC;
-                            n1C        = tab_elemsize*n0;
-
-                            rtV        = rV*tabscale;
-                            n0         = rtV;
-                            epsV       = rtV-n0;
-                            eps2V      = epsV*epsV;
-                            n1V        = tab_elemsize*n0;
+                            rPInvC = one;
+                            rInvC  = rInv;
+                            rC     = r;
+
+                            rPInvV = one;
+                            rInvV  = rInv;
+                            rV     = r;
                         }
 
-                        /* Only process the coulomb interactions if we have charges,
-                         * and if we either include all entries in the list (no cutoff
+                        /* Only process the coulomb interactions if we either
+                         * include all entries in the list (no cutoff
                          * used in the kernel), or if we are within the cutoff.
                          */
-                        bComputeElecInteraction = !bExactElecCutoff ||
-                            ( bConvertEwaldToCoulomb && r < rcoulomb) ||
-                            (!bConvertEwaldToCoulomb && rC < rcoulomb);
-
-                        if ( (qq[i] != 0) && bComputeElecInteraction)
+                        BoolType computeElecInteraction;
+                        if constexpr (elecInteractionTypeIsEwald)
+                        {
+                            computeElecInteraction = (r < rCoulomb && qq[i] != zero && bPairIncluded);
+                        }
+                        else
+                        {
+                            computeElecInteraction = (rC < rCoulomb && qq[i] != zero && bPairIncluded);
+                        }
+                        if (gmx::anyTrue(computeElecInteraction))
                         {
-                            switch (icoul)
+                            if constexpr (elecInteractionTypeIsEwald)
                             {
-                                case GMX_NBKERNEL_ELEC_COULOMB:
-                                    /* simple cutoff */
-                                    Vcoul[i]   = qq[i]*rinvC;
-                                    FscalC[i]  = Vcoul[i];
-                                    /* The shift for the Coulomb potential is stored in
-                                     * the RF parameter c_rf, which is 0 without shift.
-                                     */
-                                    Vcoul[i]  -= qq[i]*ic->c_rf;
-                                    break;
-
-                                case GMX_NBKERNEL_ELEC_REACTIONFIELD:
-                                    /* reaction-field */
-                                    Vcoul[i]   = qq[i]*(rinvC + krf*rC*rC-crf);
-                                    FscalC[i]  = qq[i]*(rinvC - two*krf*rC*rC);
-                                    break;
-
-                                case GMX_NBKERNEL_ELEC_CUBICSPLINETABLE:
-                                    /* non-Ewald tabulated coulomb */
-                                    nnn        = n1C;
-                                    Y          = VFtab[nnn];
-                                    F          = VFtab[nnn+1];
-                                    Geps       = epsC*VFtab[nnn+2];
-                                    Heps2      = eps2C*VFtab[nnn+3];
-                                    Fp         = F+Geps+Heps2;
-                                    VV         = Y+epsC*Fp;
-                                    FF         = Fp+Geps+two*Heps2;
-                                    Vcoul[i]   = qq[i]*VV;
-                                    FscalC[i]  = -qq[i]*tabscale*FF*rC;
-                                    break;
-
-                                case GMX_NBKERNEL_ELEC_EWALD:
-                                    if (bConvertEwaldToCoulomb)
-                                    {
-                                        /* Ewald FEP is done only on the 1/r part */
-                                        Vcoul[i]   = qq[i]*(rinvC-sh_ewald);
-                                        FscalC[i]  = qq[i]*rinvC;
-                                    }
-                                    else
-                                    {
-                                        ewrt      = rC*ewtabscale;
-                                        ewitab    = static_cast<int>(ewrt);
-                                        eweps     = ewrt-ewitab;
-                                        ewitab    = 4*ewitab;
-                                        FscalC[i] = ewtab[ewitab]+eweps*ewtab[ewitab+1];
-                                        rinvcorr  = rinvC-sh_ewald;
-                                        Vcoul[i]  = qq[i]*(rinvcorr-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+FscalC[i])));
-                                        FscalC[i] = qq[i]*(rinvC-rC*FscalC[i]);
-                                    }
-                                    break;
-
-                                case GMX_NBKERNEL_ELEC_NONE:
-                                    FscalC[i]  = zero;
-                                    Vcoul[i]   = zero;
-                                    break;
-
-                                default:
-                                    gmx_incons("Invalid icoul in free energy kernel");
-                                    break;
+                                vCoul[i] = ewaldPotential(qq[i], rInvC, sh_ewald);
+                                if constexpr (computeForces)
+                                {
+                                    fScalC[i] = ewaldScalarForce(qq[i], rInvC);
+                                }
+
+                                if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                                {
+                                    ewaldQuadraticPotential<computeForces>(qq[i],
+                                                                           facel,
+                                                                           rC,
+                                                                           rCutoffCoul,
+                                                                           LFC[i],
+                                                                           DLF[i],
+                                                                           gapsysScaleLinpointCoulEff,
+                                                                           sh_ewald,
+                                                                           &fScalC[i],
+                                                                           &vCoul[i],
+                                                                           &dvdlCoul,
+                                                                           computeElecInteraction);
+                                }
                             }
-
-                            if (ic->coulomb_modifier == eintmodPOTSWITCH)
+                            else
                             {
-                                d                = rC - ic->rcoulomb_switch;
-                                d                = (d > zero) ? d : zero;
-                                d2               = d*d;
-                                sw               = one+d2*d*(elec_swV3+d*(elec_swV4+d*elec_swV5));
-                                dsw              = d2*(elec_swF2+d*(elec_swF3+d*elec_swF4));
-
-                                FscalC[i]        = FscalC[i]*sw - rC*Vcoul[i]*dsw;
-                                Vcoul[i]        *= sw;
+                                vCoul[i] = reactionFieldPotential(qq[i], rInvC, rC, krf, crf);
+                                if constexpr (computeForces)
+                                {
+                                    fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two);
+                                }
+
+                                if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
+                                {
+                                    reactionFieldQuadraticPotential<computeForces>(
+                                            qq[i],
+                                            facel,
+                                            rC,
+                                            rCutoffCoul,
+                                            LFC[i],
+                                            DLF[i],
+                                            gapsysScaleLinpointCoulEff,
+                                            krf,
+                                            crf,
+                                            &fScalC[i],
+                                            &vCoul[i],
+                                            &dvdlCoul,
+                                            computeElecInteraction);
+                                }
+                            }
 
-                                FscalC[i]        = (rC < rcoulomb) ? FscalC[i] : zero;
-                                Vcoul[i]         = (rC < rcoulomb) ? Vcoul[i] : zero;
+                            vCoul[i] = gmx::selectByMask(vCoul[i], computeElecInteraction);
+                            if constexpr (computeForces)
+                            {
+                                fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction);
                             }
                         }
 
-                        /* Only process the VDW interactions if we have
-                         * some non-zero parameters, and if we either
+                        /* Only process the VDW interactions if we either
                          * include all entries in the list (no cutoff used
                          * in the kernel), or if we are within the cutoff.
                          */
-                        bComputeVdwInteraction = !bExactVdwCutoff ||
-                            ( bConvertLJEwaldToLJ6 && r < rvdw) ||
-                            (!bConvertLJEwaldToLJ6 && rV < rvdw);
-                        if ((c6[i] != 0 || c12[i] != 0) && bComputeVdwInteraction)
+                        BoolType computeVdwInteraction;
+                        if constexpr (vdwInteractionTypeIsEwald)
+                        {
+                            computeVdwInteraction =
+                                    (r < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
+                        }
+                        else
+                        {
+                            computeVdwInteraction =
+                                    (rV < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded);
+                        }
+                        if (gmx::anyTrue(computeVdwInteraction))
                         {
-                            switch (ivdw)
+                            RealType rInv6;
+                            if constexpr (softcoreType == KernelSoftcoreType::Beutler)
+                            {
+                                rInv6 = rPInvV;
+                            }
+                            else
+                            {
+                                rInv6 = calculateRinv6(rInvV);
+                            }
+                            // Avoid overflow at short distance for masked exclusions and
+                            // for foreign energy calculations at a hard core end state.
+                            // Note that we should limit r^-6, and thus also r^-12, and
+                            // not only r^-12, as that could lead to erroneously low instead
+                            // of very high foreign energies.
+                            rInv6           = gmx::min(rInv6, maxRInvSix);
+                            RealType vVdw6  = calculateVdw6(c6[i], rInv6);
+                            RealType vVdw12 = calculateVdw12(c12[i], rInv6);
+
+                            vVdw[i] = lennardJonesPotential(
+                                    vVdw6, vVdw12, c6[i], c12[i], repulsionShift, dispersionShift, oneSixth, oneTwelfth);
+                            if constexpr (computeForces)
                             {
-                                case GMX_NBKERNEL_VDW_LENNARDJONES:
-                                    /* cutoff LJ */
-                                    if (sc_r_power == six)
-                                    {
-                                        rinv6            = rpinvV;
-                                    }
-                                    else
-                                    {
-                                        rinv6            = rinvV*rinvV;
-                                        rinv6            = rinv6*rinv6*rinv6;
-                                    }
-                                    Vvdw6            = c6[i]*rinv6;
-                                    Vvdw12           = c12[i]*rinv6*rinv6;
-
-                                    Vvdw[i]          = ( (Vvdw12 - c12[i]*sh_invrc6*sh_invrc6)*onetwelfth
-                                                         - (Vvdw6 - c6[i]*sh_invrc6)*onesixth);
-                                    FscalV[i]        = Vvdw12 - Vvdw6;
-                                    break;
-
-                                case GMX_NBKERNEL_VDW_BUCKINGHAM:
-                                    gmx_fatal(FARGS, "Buckingham free energy not supported.");
-                                    break;
-
-                                case GMX_NBKERNEL_VDW_CUBICSPLINETABLE:
-                                    /* Table LJ */
-                                    nnn = n1V+4;
-                                    /* dispersion */
-                                    Y          = VFtab[nnn];
-                                    F          = VFtab[nnn+1];
-                                    Geps       = epsV*VFtab[nnn+2];
-                                    Heps2      = eps2V*VFtab[nnn+3];
-                                    Fp         = F+Geps+Heps2;
-                                    VV         = Y+epsV*Fp;
-                                    FF         = Fp+Geps+two*Heps2;
-                                    Vvdw[i]   += c6[i]*VV;
-                                    FscalV[i] -= c6[i]*tabscale*FF*rV;
-
-                                    /* repulsion */
-                                    Y          = VFtab[nnn+4];
-                                    F          = VFtab[nnn+5];
-                                    Geps       = epsV*VFtab[nnn+6];
-                                    Heps2      = eps2V*VFtab[nnn+7];
-                                    Fp         = F+Geps+Heps2;
-                                    VV         = Y+epsV*Fp;
-                                    FF         = Fp+Geps+two*Heps2;
-                                    Vvdw[i]   += c12[i]*VV;
-                                    FscalV[i] -= c12[i]*tabscale*FF*rV;
-                                    break;
-
-                                case GMX_NBKERNEL_VDW_LJEWALD:
-                                    if (sc_r_power == six)
-                                    {
-                                        rinv6            = rpinvV;
-                                    }
-                                    else
-                                    {
-                                        rinv6            = rinvV*rinvV;
-                                        rinv6            = rinv6*rinv6*rinv6;
-                                    }
-                                    c6grid           = nbfp_grid[tj[i]];
-
-                                    if (bConvertLJEwaldToLJ6)
-                                    {
-                                        /* cutoff LJ */
-                                        Vvdw6            = c6[i]*rinv6;
-                                        Vvdw12           = c12[i]*rinv6*rinv6;
-
-                                        Vvdw[i]          = ( (Vvdw12 - c12[i]*sh_invrc6*sh_invrc6)*onetwelfth
-                                                             - (Vvdw6 - c6[i]*sh_invrc6 - c6grid*sh_lj_ewald)*onesixth);
-                                        FscalV[i]        = Vvdw12 - Vvdw6;
-                                    }
-                                    else
-                                    {
-                                        /* Normal LJ-PME */
-                                        ewcljrsq         = ewclj2*rV*rV;
-                                        exponent         = std::exp(-ewcljrsq);
-                                        poly             = exponent*(one + ewcljrsq + ewcljrsq*ewcljrsq*half);
-                                        vvdw_disp        = (c6[i]-c6grid*(one-poly))*rinv6;
-                                        vvdw_rep         = c12[i]*rinv6*rinv6;
-                                        FscalV[i]        = vvdw_rep - vvdw_disp - c6grid*onesixth*exponent*ewclj6;
-                                        Vvdw[i]          = (vvdw_rep - c12[i]*sh_invrc6*sh_invrc6)*onetwelfth - (vvdw_disp - c6[i]*sh_invrc6 - c6grid*sh_lj_ewald)/six;
-                                    }
-                                    break;
-
-                                case GMX_NBKERNEL_VDW_NONE:
-                                    Vvdw[i]    = zero;
-                                    FscalV[i]  = zero;
-                                    break;
-
-                                default:
-                                    gmx_incons("Invalid ivdw in free energy kernel");
-                                    break;
+                                fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12);
                             }
 
-                            if (ic->vdw_modifier == eintmodPOTSWITCH)
+                            if constexpr (softcoreType == KernelSoftcoreType::Gapsys)
                             {
-                                d                = rV - ic->rvdw_switch;
-                                d                = (d > zero) ? d : zero;
-                                d2               = d*d;
-                                sw               = one+d2*d*(vdw_swV3+d*(vdw_swV4+d*vdw_swV5));
-                                dsw              = d2*(vdw_swF2+d*(vdw_swF3+d*vdw_swF4));
+                                lennardJonesQuadraticPotential<computeForces>(c6[i],
+                                                                              c12[i],
+                                                                              r,
+                                                                              rSq,
+                                                                              LFV[i],
+                                                                              DLF[i],
+                                                                              gapsysSigma6VdWEff[i],
+                                                                              gapsysScaleLinpointVdWEff,
+                                                                              repulsionShift,
+                                                                              dispersionShift,
+                                                                              &fScalV[i],
+                                                                              &vVdw[i],
+                                                                              &dvdlVdw,
+                                                                              computeVdwInteraction);
+                            }
 
-                                FscalV[i]        = FscalV[i]*sw - rV*Vvdw[i]*dsw;
-                                Vvdw[i]         *= sw;
+                            if constexpr (vdwInteractionTypeIsEwald)
+                            {
+                                /* Subtract the grid potential at the cut-off */
+                                vVdw[i] = vVdw[i]
+                                          + gmx::selectByMask(ewaldLennardJonesGridSubtract(
+                                                                      ljPmeC6Grid[i], shLjEwald, oneSixth),
+                                                              computeVdwInteraction);
+                            }
+
+                            if constexpr (vdwModifierIsPotSwitch)
+                            {
+                                RealType d             = rV - rVdwSwitch;
+                                BoolType zeroMask      = zero < d;
+                                BoolType potSwitchMask = rV < rVdw;
+                                d                      = gmx::selectByMask(d, zeroMask);
+                                const RealType d2      = d * d;
+                                const RealType sw =
+                                        one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5));
+
+                                if constexpr (computeForces)
+                                {
+                                    const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4));
+                                    fScalV[i]          = potSwitchScalarForceMod(
+                                            fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask);
+                                }
+                                vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, potSwitchMask);
+                            }
 
-                                FscalV[i]  = (rV < rvdw) ? FscalV[i] : zero;
-                                Vvdw[i]    = (rV < rvdw) ? Vvdw[i] : zero;
+                            vVdw[i] = gmx::selectByMask(vVdw[i], computeVdwInteraction);
+                            if constexpr (computeForces)
+                            {
+                                fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction);
                             }
                         }
 
-                        /* FscalC (and FscalV) now contain: dV/drC * rC
-                         * Now we multiply by rC^-p, so it will be: dV/drC * rC^1-p
-                         * Further down we first multiply by r^p-2 and then by
-                         * the vector r, which in total gives: dV/drC * (r/rC)^1-p
-                         */
-                        FscalC[i] *= rpinvC;
-                        FscalV[i] *= rpinvV;
-                    }
-                }
+                        if constexpr (computeForces)
+                        {
+                            /* fScalC (and fScalV) now contain: dV/drC * rC
+                             * Now we multiply by rC^-6, so it will be: dV/drC * rC^-5
+                             * Further down we first multiply by r^4 and then by
+                             * the vector r, which in total gives: dV/drC * (r/rC)^-5
+                             */
+                            fScalC[i] = fScalC[i] * rPInvC;
+                            fScalV[i] = fScalV[i] * rPInvV;
+                        }
+                    } // end of block requiring nonZeroState
+                }     // end for (int i = 0; i < NSTATES; i++)
 
-                /* Assemble A and B states */
-                for (i = 0; i < NSTATES; i++)
+                /* Assemble A and B states. */
+                BoolType assembleStates = (bPairIncluded && withinCutoffMask);
+                if (gmx::anyTrue(assembleStates))
                 {
-                    vctot         += LFC[i]*Vcoul[i];
-                    vvtot         += LFV[i]*Vvdw[i];
+                    for (int i = 0; i < NSTATES; i++)
+                    {
+                        vCTot = vCTot + LFC[i] * vCoul[i];
+                        vVTot = vVTot + LFV[i] * vVdw[i];
 
-                    Fscal         += LFC[i]*FscalC[i]*rpm2;
-                    Fscal         += LFV[i]*FscalV[i]*rpm2;
+                        if constexpr (computeForces)
+                        {
+                            fScal = fScal + LFC[i] * fScalC[i] * rpm2;
+                            fScal = fScal + LFV[i] * fScalV[i] * rpm2;
+                        }
 
-                    dvdl_coul     += Vcoul[i]*DLF[i] + LFC[i]*alpha_coul_eff*dlfac_coul[i]*FscalC[i]*sigma_pow[i];
-                    dvdl_vdw      += Vvdw[i]*DLF[i] + LFV[i]*alpha_vdw_eff*dlfac_vdw[i]*FscalV[i]*sigma_pow[i];
+                        if constexpr (softcoreType == KernelSoftcoreType::Beutler)
+                        {
+                            dvdlCoul = dvdlCoul + vCoul[i] * DLF[i]
+                                       + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i];
+                            dvdlVdw = dvdlVdw + vVdw[i] * DLF[i]
+                                      + LFV[i] * alphaVdwEff * dlFacVdw[i] * fScalV[i] * sigma6[i];
+                        }
+                        else
+                        {
+                            dvdlCoul = dvdlCoul + vCoul[i] * DLF[i];
+                            dvdlVdw  = dvdlVdw + vVdw[i] * DLF[i];
+                        }
+                    }
                 }
-            }
-            else if (icoul == GMX_NBKERNEL_ELEC_REACTIONFIELD)
+            } // end of block requiring bPairIncluded && withinCutoffMask
+            /* In the following block bPairIncluded should be false in the masks. */
+            if (icoul == NbkernelElecType::ReactionField)
             {
-                /* For excluded pairs, which are only in this pair list when
-                 * using the Verlet scheme, we don't use soft-core.
-                 * The group scheme also doesn't soft-core for these.
-                 * As there is no singularity, there is no need for soft-core.
-                 */
-                VV = krf*rsq - crf;
-                FF = -two*krf;
+                const BoolType computeReactionField = bPairExcluded;
 
-                if (ii == jnr)
+                if (gmx::anyTrue(computeReactionField))
                 {
-                    VV *= half;
-                }
+                    /* For excluded pairs we don't use soft-core.
+                     * As there is no singularity, there is no need for soft-core.
+                     */
+                    const RealType FF = -two * krf;
+                    RealType       VV = krf * rSq - crf;
 
-                for (i = 0; i < NSTATES; i++)
-                {
-                    vctot      += LFC[i]*qq[i]*VV;
-                    Fscal      += LFC[i]*qq[i]*FF;
-                    dvdl_coul  += DLF[i]*qq[i]*VV;
+                    /* If ii == jnr the i particle (ii) has itself (jnr)
+                     * in its neighborlist. This corresponds to a self-interaction
+                     * that will occur twice. Scale it down by 50% to only include
+                     * it once.
+                     */
+                    VV = VV * gmx::blend(one, half, bIiEqJnr);
+
+                    for (int i = 0; i < NSTATES; i++)
+                    {
+                        vCTot = vCTot + gmx::selectByMask(LFC[i] * qq[i] * VV, computeReactionField);
+                        fScal = fScal + gmx::selectByMask(LFC[i] * qq[i] * FF, computeReactionField);
+                        dvdlCoul = dvdlCoul + gmx::selectByMask(DLF[i] * qq[i] * VV, computeReactionField);
+                    }
                 }
             }
 
-            if (bConvertEwaldToCoulomb && ( !bExactElecCutoff || r < rcoulomb ) )
+            const BoolType computeElecEwaldInteraction = (bPairExcluded || r < rCoulomb);
+            if (elecInteractionTypeIsEwald && gmx::anyTrue(computeElecEwaldInteraction))
             {
                 /* See comment in the preamble. When using Ewald interactions
                  * (unless we use a switch modifier) we subtract the reciprocal-space
@@ -792,39 +1034,39 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                  * the softcore to the entire electrostatic interaction,
                  * including the reciprocal-space component.
                  */
-                real v_lr, f_lr;
+                RealType v_lr, f_lr;
 
-                ewrt      = r*ewtabscale;
-                ewitab    = static_cast<int>(ewrt);
-                eweps     = ewrt-ewitab;
-                ewitab    = 4*ewitab;
-                f_lr      = ewtab[ewitab]+eweps*ewtab[ewitab+1];
-                v_lr      = (ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+f_lr));
-                f_lr     *= rinv;
+                pmeCoulombCorrectionVF<computeForces>(rSq, ewaldBeta, &v_lr, &f_lr);
+                if constexpr (computeForces)
+                {
+                    f_lr = f_lr * rInv * rInv;
+                }
 
                 /* Note that any possible Ewald shift has already been applied in
                  * the normal interaction part above.
                  */
 
-                if (ii == jnr)
-                {
-                    /* If we get here, the i particle (ii) has itself (jnr)
-                     * in its neighborlist. This can only happen with the Verlet
-                     * scheme, and corresponds to a self-interaction that will
-                     * occur twice. Scale it down by 50% to only include it once.
-                     */
-                    v_lr *= half;
-                }
+                /* If ii == jnr the i particle (ii) has itself (jnr)
+                 * in its neighborlist. This corresponds to a self-interaction
+                 * that will occur twice. Scale it down by 50% to only include
+                 * it once.
+                 */
+                v_lr = v_lr * gmx::blend(one, half, bIiEqJnr);
 
-                for (i = 0; i < NSTATES; i++)
+                for (int i = 0; i < NSTATES; i++)
                 {
-                    vctot      -= LFC[i]*qq[i]*v_lr;
-                    Fscal      -= LFC[i]*qq[i]*f_lr;
-                    dvdl_coul  -= (DLF[i]*qq[i])*v_lr;
+                    vCTot = vCTot - gmx::selectByMask(LFC[i] * qq[i] * v_lr, computeElecEwaldInteraction);
+                    if constexpr (computeForces)
+                    {
+                        fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction);
+                    }
+                    dvdlCoul = dvdlCoul
+                               - gmx::selectByMask(DLF[i] * qq[i] * v_lr, computeElecEwaldInteraction);
                 }
             }
 
-            if (bConvertLJEwaldToLJ6 && (!bExactVdwCutoff || r < rvdw))
+            const BoolType computeVdwEwaldInteraction = (bPairExcluded || r < rVdw);
+            if (vdwInteractionTypeIsEwald && gmx::anyTrue(computeVdwEwaldInteraction))
             {
                 /* See comment in the preamble. When using LJ-Ewald interactions
                  * (unless we use a switch modifier) we subtract the reciprocal-space
@@ -834,111 +1076,344 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                  * the softcore to the entire VdW interaction,
                  * including the reciprocal-space component.
                  */
-                /* We could also use the analytical form here
-                 * iso a table, but that can cause issues for
-                 * r close to 0 for non-interacting pairs.
-                 */
-                real rs, frac, f_lr;
-                int  ri;
-
-                rs     = rsq*rinv*ewtabscale;
-                ri     = static_cast<int>(rs);
-                frac   = rs - ri;
-                f_lr   = (1 - frac)*tab_ewald_F_lj[ri] + frac*tab_ewald_F_lj[ri+1];
-                /* TODO: Currently the Ewald LJ table does not contain
-                 * the factor 1/6, we should add this.
-                 */
-                FF     = f_lr*rinv/six;
-                VV     = (tab_ewald_V_lj[ri] - ewtabhalfspace*frac*(tab_ewald_F_lj[ri] + f_lr))/six;
 
-                if (ii == jnr)
-                {
-                    /* If we get here, the i particle (ii) has itself (jnr)
-                     * in its neighborlist. This can only happen with the Verlet
-                     * scheme, and corresponds to a self-interaction that will
-                     * occur twice. Scale it down by 50% to only include it once.
-                     */
-                    VV *= half;
-                }
+                RealType v_lr, f_lr;
+                pmeLJCorrectionVF<computeForces>(
+                        rInv, rSq, ewaldLJCoeffSq, ewaldLJCoeffSixDivSix, &v_lr, &f_lr, computeVdwEwaldInteraction, bIiEqJnr);
+                v_lr = v_lr * oneSixth;
 
-                for (i = 0; i < NSTATES; i++)
+                for (int i = 0; i < NSTATES; i++)
                 {
-                    c6grid      = nbfp_grid[tj[i]];
-                    vvtot      += LFV[i]*c6grid*VV;
-                    Fscal      += LFV[i]*c6grid*FF;
-                    dvdl_vdw   += (DLF[i]*c6grid)*VV;
+                    vVTot = vVTot + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
+                    if constexpr (computeForces)
+                    {
+                        fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction);
+                    }
+                    dvdlVdw = dvdlVdw + gmx::selectByMask(DLF[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction);
                 }
             }
 
-            if (bDoForces)
+            if (computeForces && gmx::anyTrue(fScal != zero))
             {
-                tx         = Fscal*dx;
-                ty         = Fscal*dy;
-                tz         = Fscal*dz;
-                fix        = fix + tx;
-                fiy        = fiy + ty;
-                fiz        = fiz + tz;
-                /* OpenMP atomics are expensive, but this kernels is also
-                 * expensive, so we can take this hit, instead of using
-                 * thread-local output buffers and extra reduction.
-                 *
-                 * All the OpenMP regions in this file are trivial and should
-                 * not throw, so no need for try/catch.
-                 */
-#pragma omp atomic
-                f[j3]     -= tx;
-#pragma omp atomic
-                f[j3+1]   -= ty;
-#pragma omp atomic
-                f[j3+2]   -= tz;
+                const RealType tX = fScal * dX;
+                const RealType tY = fScal * dY;
+                const RealType tZ = fScal * dZ;
+                fIX               = fIX + tX;
+                fIY               = fIY + tY;
+                fIZ               = fIZ + tZ;
+
+                gmx::transposeScatterDecrU<3>(forceRealPtr, preloadJnr, tX, tY, tZ);
             }
-        }
+        } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth)
 
-        /* The atomics below are expensive with many OpenMP threads.
-         * Here unperturbed i-particles will usually only have a few
-         * (perturbed) j-particles in the list. Thus with a buffered list
-         * we can skip a significant number of i-reductions with a check.
-         */
-        if (npair_within_cutoff > 0)
+        if (havePairsWithinCutoff)
         {
-            if (bDoForces)
-            {
-#pragma omp atomic
-                f[ii3]        += fix;
-#pragma omp atomic
-                f[ii3+1]      += fiy;
-#pragma omp atomic
-                f[ii3+2]      += fiz;
-            }
-            if (bDoShiftForces)
+            if constexpr (computeForces)
             {
-#pragma omp atomic
-                fshift[is3]   += fix;
-#pragma omp atomic
-                fshift[is3+1] += fiy;
-#pragma omp atomic
-                fshift[is3+2] += fiz;
+                gmx::transposeScatterIncrU<3>(forceRealPtr, preloadIi, fIX, fIY, fIZ);
+
+                if (doShiftForces)
+                {
+                    gmx::transposeScatterIncrU<3>(
+                            reinterpret_cast<real*>(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ);
+                }
             }
-            if (bDoPotential)
+            if (doPotential)
             {
-                ggid               = gid[n];
-#pragma omp atomic
-                Vc[ggid]          += vctot;
-#pragma omp atomic
-                Vv[ggid]          += vvtot;
+                int ggid = gid[n];
+                threadVc[ggid] += gmx::reduce(vCTot);
+                threadVv[ggid] += gmx::reduce(vVTot);
             }
         }
-    }
+    } // end for (int n = 0; n < nri; n++)
 
-#pragma omp atomic
-    dvdl[efptCOUL]     += dvdl_coul;
- #pragma omp atomic
-    dvdl[efptVDW]      += dvdl_vdw;
+    if (gmx::anyTrue(dvdlCoul != zero))
+    {
+        threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)] += gmx::reduce(dvdlCoul);
+    }
+    if (gmx::anyTrue(dvdlVdw != zero))
+    {
+        threadDvdl[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)] += gmx::reduce(dvdlVdw);
+    }
 
     /* Estimate flops, average for free energy stuff:
      * 12  flops per outer iteration
      * 150 flops per inner iteration
+     * TODO: Update the number of flops and/or use different counts for different code paths.
      */
-#pragma omp atomic
-    inc_nrnb(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist->nri*12 + nlist->jindex[n]*150);
+    atomicNrnbIncrement(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist.nri * 12 + nlist.jindex[nri] * 150);
+
+    if (haveExcludedPairsBeyondRlist > 0)
+    {
+        gmx_fatal(FARGS,
+                  "There are perturbed non-bonded pair interactions beyond the pair-list cutoff "
+                  "of %g nm, which is not supported. This can happen because the system is "
+                  "unstable or because intra-molecular interactions at long distances are "
+                  "excluded. If the "
+                  "latter is the case, you can try to increase nstlist or rlist to avoid this."
+                  "The error is likely triggered by the use of couple-intramol=no "
+                  "and the maximal distance in the decoupled molecule exceeding rlist.",
+                  rlist);
+    }
+}
+
+typedef void (*KernelFunction)(const t_nblist&                                  nlist,
+                               const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
+                               const int                                        ntype,
+                               const real                                       rlist,
+                               const interaction_const_t&                       ic,
+                               gmx::ArrayRef<const gmx::RVec>                   shiftvec,
+                               gmx::ArrayRef<const real>                        nbfp,
+                               gmx::ArrayRef<const real>                        nbfp_grid,
+                               gmx::ArrayRef<const real>                        chargeA,
+                               gmx::ArrayRef<const real>                        chargeB,
+                               gmx::ArrayRef<const int>                         typeA,
+                               gmx::ArrayRef<const int>                         typeB,
+                               int                                              flags,
+                               gmx::ArrayRef<const real>                        lambda,
+                               t_nrnb* gmx_restrict                             nrnb,
+                               gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
+                               rvec*               threadForceShiftBuffer,
+                               gmx::ArrayRef<real> threadVc,
+                               gmx::ArrayRef<real> threadVv,
+                               gmx::ArrayRef<real> threadDvdl);
+
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch, bool computeForces>
+static KernelFunction dispatchKernelOnUseSimd(const bool useSimd)
+{
+    if (useSimd)
+    {
+#if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS
+        return (nb_free_energy_kernel<SimdDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
+#else
+        return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
+#endif
+    }
+    else
+    {
+        return (nb_free_energy_kernel<ScalarDataTypes, softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces>);
+    }
+}
+
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald, bool vdwModifierIsPotSwitch>
+static KernelFunction dispatchKernelOnComputeForces(const bool computeForces, const bool useSimd)
+{
+    if (computeForces)
+    {
+        return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, true>(
+                useSimd));
+    }
+    else
+    {
+        return (dispatchKernelOnUseSimd<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, false>(
+                useSimd));
+    }
+}
+
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald, bool elecInteractionTypeIsEwald>
+static KernelFunction dispatchKernelOnVdwModifier(const bool vdwModifierIsPotSwitch,
+                                                  const bool computeForces,
+                                                  const bool useSimd)
+{
+    if (vdwModifierIsPotSwitch)
+    {
+        return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, true>(
+                computeForces, useSimd));
+    }
+    else
+    {
+        return (dispatchKernelOnComputeForces<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, false>(
+                computeForces, useSimd));
+    }
+}
+
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer, bool vdwInteractionTypeIsEwald>
+static KernelFunction dispatchKernelOnElecInteractionType(const bool elecInteractionTypeIsEwald,
+                                                          const bool vdwModifierIsPotSwitch,
+                                                          const bool computeForces,
+                                                          const bool useSimd)
+{
+    if (elecInteractionTypeIsEwald)
+    {
+        return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, true>(
+                vdwModifierIsPotSwitch, computeForces, useSimd));
+    }
+    else
+    {
+        return (dispatchKernelOnVdwModifier<softcoreType, scLambdasOrAlphasDiffer, vdwInteractionTypeIsEwald, false>(
+                vdwModifierIsPotSwitch, computeForces, useSimd));
+    }
+}
+
+template<KernelSoftcoreType softcoreType, bool scLambdasOrAlphasDiffer>
+static KernelFunction dispatchKernelOnVdwInteractionType(const bool vdwInteractionTypeIsEwald,
+                                                         const bool elecInteractionTypeIsEwald,
+                                                         const bool vdwModifierIsPotSwitch,
+                                                         const bool computeForces,
+                                                         const bool useSimd)
+{
+    if (vdwInteractionTypeIsEwald)
+    {
+        return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, true>(
+                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
+    }
+    else
+    {
+        return (dispatchKernelOnElecInteractionType<softcoreType, scLambdasOrAlphasDiffer, false>(
+                elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
+    }
+}
+
+template<KernelSoftcoreType softcoreType>
+static KernelFunction dispatchKernelOnScLambdasOrAlphasDifference(const bool scLambdasOrAlphasDiffer,
+                                                                  const bool vdwInteractionTypeIsEwald,
+                                                                  const bool elecInteractionTypeIsEwald,
+                                                                  const bool vdwModifierIsPotSwitch,
+                                                                  const bool computeForces,
+                                                                  const bool useSimd)
+{
+    if (scLambdasOrAlphasDiffer)
+    {
+        return (dispatchKernelOnVdwInteractionType<softcoreType, true>(
+                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
+    }
+    else
+    {
+        return (dispatchKernelOnVdwInteractionType<softcoreType, false>(
+                vdwInteractionTypeIsEwald, elecInteractionTypeIsEwald, vdwModifierIsPotSwitch, computeForces, useSimd));
+    }
+}
+
+static KernelFunction dispatchKernel(const bool                 scLambdasOrAlphasDiffer,
+                                     const bool                 vdwInteractionTypeIsEwald,
+                                     const bool                 elecInteractionTypeIsEwald,
+                                     const bool                 vdwModifierIsPotSwitch,
+                                     const bool                 computeForces,
+                                     const bool                 useSimd,
+                                     const interaction_const_t& ic)
+{
+    const auto& scParams = *ic.softCoreParameters;
+    if (scParams.softcoreType == SoftcoreType::Beutler)
+    {
+        if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
+        {
+            return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
+                    scLambdasOrAlphasDiffer,
+                    vdwInteractionTypeIsEwald,
+                    elecInteractionTypeIsEwald,
+                    vdwModifierIsPotSwitch,
+                    computeForces,
+                    useSimd));
+        }
+        return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Beutler>(
+                scLambdasOrAlphasDiffer,
+                vdwInteractionTypeIsEwald,
+                elecInteractionTypeIsEwald,
+                vdwModifierIsPotSwitch,
+                computeForces,
+                useSimd));
+    }
+    else // Gapsys
+    {
+        if (scParams.gapsysScaleLinpointCoul == 0 && scParams.gapsysScaleLinpointVdW == 0)
+        {
+            return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::None>(
+                    scLambdasOrAlphasDiffer,
+                    vdwInteractionTypeIsEwald,
+                    elecInteractionTypeIsEwald,
+                    vdwModifierIsPotSwitch,
+                    computeForces,
+                    useSimd));
+        }
+        return (dispatchKernelOnScLambdasOrAlphasDifference<KernelSoftcoreType::Gapsys>(
+                scLambdasOrAlphasDiffer,
+                vdwInteractionTypeIsEwald,
+                elecInteractionTypeIsEwald,
+                vdwModifierIsPotSwitch,
+                computeForces,
+                useSimd));
+    }
+}
+
+
+void gmx_nb_free_energy_kernel(const t_nblist&                                  nlist,
+                               const gmx::ArrayRefWithPadding<const gmx::RVec>& coords,
+                               const bool                                       useSimd,
+                               const int                                        ntype,
+                               const real                                       rlist,
+                               const interaction_const_t&                       ic,
+                               gmx::ArrayRef<const gmx::RVec>                   shiftvec,
+                               gmx::ArrayRef<const real>                        nbfp,
+                               gmx::ArrayRef<const real>                        nbfp_grid,
+                               gmx::ArrayRef<const real>                        chargeA,
+                               gmx::ArrayRef<const real>                        chargeB,
+                               gmx::ArrayRef<const int>                         typeA,
+                               gmx::ArrayRef<const int>                         typeB,
+                               int                                              flags,
+                               gmx::ArrayRef<const real>                        lambda,
+                               t_nrnb*                                          nrnb,
+                               gmx::ArrayRefWithPadding<gmx::RVec>              threadForceBuffer,
+                               rvec*               threadForceShiftBuffer,
+                               gmx::ArrayRef<real> threadVc,
+                               gmx::ArrayRef<real> threadVv,
+                               gmx::ArrayRef<real> threadDvdl)
+{
+    GMX_ASSERT(EEL_PME_EWALD(ic.eeltype) || ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype),
+               "Unsupported eeltype with free energy");
+    GMX_ASSERT(ic.softCoreParameters, "We need soft-core parameters");
+
+    // Not all SIMD implementations need padding, but we provide padding anyhow so we can assert
+    GMX_ASSERT(!GMX_SIMD_HAVE_REAL || threadForceBuffer.empty()
+                       || threadForceBuffer.size() > threadForceBuffer.unpaddedArrayRef().ssize(),
+               "We need actual padding with at least one element for SIMD scatter operations");
+
+    const auto& scParams                   = *ic.softCoreParameters;
+    const bool  vdwInteractionTypeIsEwald  = (EVDW_PME(ic.vdwtype));
+    const bool  elecInteractionTypeIsEwald = (EEL_PME_EWALD(ic.eeltype));
+    const bool  vdwModifierIsPotSwitch     = (ic.vdw_modifier == InteractionModifiers::PotSwitch);
+    const bool  computeForces              = ((flags & GMX_NONBONDED_DO_FORCE) != 0);
+    bool        scLambdasOrAlphasDiffer    = true;
+
+    if (scParams.alphaCoulomb == 0 && scParams.alphaVdw == 0)
+    {
+        scLambdasOrAlphasDiffer = false;
+    }
+    else
+    {
+        if (lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Coul)]
+                    == lambda[static_cast<int>(FreeEnergyPerturbationCouplingType::Vdw)]
+            && scParams.alphaCoulomb == scParams.alphaVdw)
+        {
+            scLambdasOrAlphasDiffer = false;
+        }
+    }
+
+    KernelFunction kernelFunc;
+    kernelFunc = dispatchKernel(scLambdasOrAlphasDiffer,
+                                vdwInteractionTypeIsEwald,
+                                elecInteractionTypeIsEwald,
+                                vdwModifierIsPotSwitch,
+                                computeForces,
+                                useSimd,
+                                ic);
+    kernelFunc(nlist,
+               coords,
+               ntype,
+               rlist,
+               ic,
+               shiftvec,
+               nbfp,
+               nbfp_grid,
+               chargeA,
+               chargeB,
+               typeA,
+               typeB,
+               flags,
+               lambda,
+               nrnb,
+               threadForceBuffer,
+               threadForceShiftBuffer,
+               threadVc,
+               threadVv,
+               threadDvdl);
 }