From c8888c9f87f6cbc4be65a815afe9041a993fb689 Mon Sep 17 00:00:00 2001 From: Magnus Lundborg Date: Wed, 29 Sep 2021 09:16:19 +0000 Subject: [PATCH] SIMD support for nonbonded free-energy kernels --- docs/release-notes/2022/major/highlights.rst | 2 + docs/release-notes/2022/major/performance.rst | 13 + .../gmxlib/nonbonded/nb_free_energy.cpp | 1021 +++++++++-------- src/gromacs/gmxlib/nonbonded/nb_free_energy.h | 44 +- .../gmxlib/nonbonded/tests/nb_free_energy.cpp | 11 +- src/gromacs/mdlib/sim_util.cpp | 42 +- src/gromacs/nbnxm/CMakeLists.txt | 3 +- src/gromacs/nbnxm/freeenergydispatch.cpp | 462 ++++++++ src/gromacs/nbnxm/freeenergydispatch.h | 112 ++ src/gromacs/nbnxm/kerneldispatch.cpp | 166 --- src/gromacs/nbnxm/nbnxm.cpp | 1 - src/gromacs/nbnxm/nbnxm.h | 50 +- src/gromacs/nbnxm/nbnxm_setup.cpp | 7 +- src/gromacs/timing/wallcycle.cpp | 1 + src/gromacs/timing/wallcycle.h | 1 + 15 files changed, 1232 insertions(+), 704 deletions(-) create mode 100644 src/gromacs/nbnxm/freeenergydispatch.cpp create mode 100644 src/gromacs/nbnxm/freeenergydispatch.h diff --git a/docs/release-notes/2022/major/highlights.rst b/docs/release-notes/2022/major/highlights.rst index 7be8d63f48..f9adc74334 100644 --- a/docs/release-notes/2022/major/highlights.rst +++ b/docs/release-notes/2022/major/highlights.rst @@ -12,6 +12,8 @@ several new features are available for running simulations. We are extremely interested in your feedback on how well the new release works on your simulations and hardware. The new features are: +* Free-energy kernels are accelerated using SIMD, which make free-energy + calculations up to three times as fast when using GPUs * New transformation pull coordinate allows arbibrary mathematical transformations of one of more other pull coordinates * Cool quotes music play list diff --git a/docs/release-notes/2022/major/performance.rst b/docs/release-notes/2022/major/performance.rst index 4e1f6baffc..580ef631a3 100644 --- a/docs/release-notes/2022/major/performance.rst +++ b/docs/release-notes/2022/major/performance.rst @@ -13,3 +13,16 @@ Dynamic pairlist generation for energy minimization With energy minimization, the pairlist, and domain decomposition when running in parallel, is now performed when at least one atom has moved more than the half the pairlist buffer size. The pairlist used to be constructed every step. + +Nonbonded free-energy kernels use SIMD +"""""""""""""""""""""""""""""""""""""" + +Free energy calculation performance is improved by making the nonbonded free-energy +kernels SIMD accelerated. On AVX2-256 these kernels are 4 to 8 times as fast. +This should give a noticeable speed-up for most systems, especially if the +perturbed interaction calculations were a bottleneck. This is particularly the +case when using GPUs, where the performance improvement of free-energy runs is +up to a factor of 3. + +:issue:`2875` +:issue:`742` diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp b/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp index d73bc7fb0e..45e09c25bc 100644 --- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp +++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.cpp @@ -42,11 +42,13 @@ #include "config.h" #include +#include #include #include "gromacs/gmxlib/nrnb.h" #include "gromacs/gmxlib/nonbonded/nonbonded.h" +#include "gromacs/math/arrayrefwithpadding.h" #include "gromacs/math/functions.h" #include "gromacs/math/vec.h" #include "gromacs/mdtypes/forceoutput.h" @@ -55,7 +57,9 @@ #include "gromacs/mdtypes/md_enums.h" #include "gromacs/mdtypes/mdatom.h" #include "gromacs/mdtypes/nblist.h" +#include "gromacs/pbcutil/ishift.h" #include "gromacs/simd/simd.h" +#include "gromacs/simd/simd_math.h" #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/arrayref.h" @@ -63,29 +67,69 @@ //! Scalar (non-SIMD) data types. struct ScalarDataTypes { - using RealType = real; //!< The data type to use as real. - using IntType = int; //!< The data type to use as int. - static constexpr int simdRealWidth = 1; //!< The width of the RealType. - static constexpr int simdIntWidth = 1; //!< The width of the IntType. + using RealType = real; //!< The data type to use as real. + using IntType = int; //!< The data type to use as int. + using BoolType = bool; //!< The data type to use as bool for real value comparison. + static constexpr int simdRealWidth = 1; //!< The width of the RealType. + static constexpr int simdIntWidth = 1; //!< The width of the IntType. }; #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS //! SIMD data types. struct SimdDataTypes { - using RealType = gmx::SimdReal; //!< The data type to use as real. - using IntType = gmx::SimdInt32; //!< The data type to use as int. - static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH; //!< The width of the RealType. - static constexpr int simdIntWidth = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType. + using RealType = gmx::SimdReal; //!< The data type to use as real. + using IntType = gmx::SimdInt32; //!< The data type to use as int. + using BoolType = gmx::SimdBool; //!< The data type to use as bool for real value comparison. + static constexpr int simdRealWidth = GMX_SIMD_REAL_WIDTH; //!< The width of the RealType. +# if GMX_SIMD_HAVE_DOUBLE && GMX_DOUBLE + static constexpr int simdIntWidth = GMX_SIMD_DINT32_WIDTH; //!< The width of the IntType. +# else + static constexpr int simdIntWidth = GMX_SIMD_FINT32_WIDTH; //!< The width of the IntType. +# endif }; #endif +template +static inline void +pmeCoulombCorrectionVF(const RealType rSq, const real beta, RealType* pot, RealType* force, const BoolType mask) +{ + const RealType brsq = gmx::selectByMask(rSq * beta * beta, mask); + *force = -brsq * beta * gmx::pmeForceCorrection(brsq); + *pot = beta * gmx::pmePotentialCorrection(brsq); +} + +template +static inline void pmeLJCorrectionVF(const RealType rInv, + const RealType rSq, + const real ewaldLJCoeffSq, + const real ewaldLJCoeffSixDivSix, + RealType* pot, + RealType* force, + const BoolType mask, + const BoolType bIiEqJnr) +{ + // We mask rInv to get zero force and potential for masked out pair interactions + const RealType rInvSq = gmx::selectByMask(rInv * rInv, mask); + const RealType rInvSix = rInvSq * rInvSq * rInvSq; + // Mask rSq to avoid underflow in exp() + const RealType coeffSqRSq = ewaldLJCoeffSq * gmx::selectByMask(rSq, mask); + const RealType expNegCoeffSqRSq = gmx::exp(-coeffSqRSq); + const RealType poly = 1.0_real + coeffSqRSq + 0.5_real * coeffSqRSq * coeffSqRSq; + *force = rInvSix - expNegCoeffSqRSq * (rInvSix * poly + ewaldLJCoeffSixDivSix); + *force = *force * rInvSq; + // The self interaction is the limit for r -> 0 which we need to compute separately + *pot = gmx::blend( + rInvSix * (1.0_real - expNegCoeffSqRSq * poly), 0.5_real * ewaldLJCoeffSixDivSix, bIiEqJnr); +} + //! Computes r^(1/p) and 1/r^(1/p) for the standard p=6 -template -static inline void pthRoot(const RealType r, RealType* pthRoot, RealType* invPthRoot) +template +static inline void pthRoot(const RealType r, RealType* pthRoot, RealType* invPthRoot, const BoolType mask) { - *invPthRoot = gmx::invsqrt(std::cbrt(r)); - *pthRoot = 1 / (*invPthRoot); + RealType cbrtRes = gmx::cbrt(r); + *invPthRoot = gmx::maskzInvsqrt(cbrtRes, mask); + *pthRoot = gmx::maskzInv(*invPthRoot, mask); } template @@ -159,65 +203,56 @@ static inline RealType lennardJonesPotential(const RealType v6, } /* Ewald LJ */ -static inline real ewaldLennardJonesGridSubtract(const real c6grid, const real potentialShift, const real oneSixth) +template +static inline RealType ewaldLennardJonesGridSubtract(const RealType c6grid, + const real potentialShift, + const real oneSixth) { return (c6grid * potentialShift * oneSixth); } /* LJ Potential switch */ -template +template static inline RealType potSwitchScalarForceMod(const RealType fScalarInp, const RealType potential, const RealType sw, const RealType r, - const RealType rVdw, const RealType dsw, - const real zero) + const BoolType mask) { - if (r < rVdw) - { - real fScalar = fScalarInp * sw - r * potential * dsw; - return (fScalar); - } - return (zero); + /* The mask should select on rV < rVdw */ + return (gmx::selectByMask(fScalarInp * sw - r * potential * dsw, mask)); } -template -static inline RealType potSwitchPotentialMod(const RealType potentialInp, - const RealType sw, - const RealType r, - const RealType rVdw, - const real zero) +template +static inline RealType potSwitchPotentialMod(const RealType potentialInp, const RealType sw, const BoolType mask) { - if (r < rVdw) - { - real potential = potentialInp * sw; - return (potential); - } - return (zero); + /* The mask should select on rV < rVdw */ + return (gmx::selectByMask(potentialInp * sw, mask)); } //! Templated free-energy non-bonded kernel template -static void nb_free_energy_kernel(const t_nblist& nlist, - gmx::ArrayRef coords, - gmx::ForceWithShiftForces* forceWithShiftForces, - const int ntype, - const real rlist, - const interaction_const_t& ic, - gmx::ArrayRef shiftvec, - gmx::ArrayRef nbfp, - gmx::ArrayRef nbfp_grid, - gmx::ArrayRef chargeA, - gmx::ArrayRef chargeB, - gmx::ArrayRef typeA, - gmx::ArrayRef typeB, - int flags, - gmx::ArrayRef lambda, - gmx::ArrayRef dvdl, - gmx::ArrayRef energygrp_elec, - gmx::ArrayRef energygrp_vdw, - t_nrnb* gmx_restrict nrnb) +static void nb_free_energy_kernel(const t_nblist& nlist, + const gmx::ArrayRefWithPadding& coords, + const int ntype, + const real rlist, + const interaction_const_t& ic, + gmx::ArrayRef shiftvec, + gmx::ArrayRef nbfp, + gmx::ArrayRef gmx_unused nbfp_grid, + gmx::ArrayRef chargeA, + gmx::ArrayRef chargeB, + gmx::ArrayRef typeA, + gmx::ArrayRef typeB, + int flags, + gmx::ArrayRef lambda, + t_nrnb* gmx_restrict nrnb, + gmx::RVec* threadForceBuffer, + rvec* threadForceShiftBuffer, + gmx::ArrayRef threadVc, + gmx::ArrayRef threadVv, + gmx::ArrayRef threadDvdl) { #define STATE_A 0 #define STATE_B 1 @@ -225,15 +260,15 @@ static void nb_free_energy_kernel(const t_nblist& nlist, using RealType = typename DataTypes::RealType; using IntType = typename DataTypes::IntType; + using BoolType = typename DataTypes::BoolType; - /* FIXME: How should these be handled with SIMD? */ - constexpr real oneTwelfth = 1.0 / 12.0; - constexpr real oneSixth = 1.0 / 6.0; - constexpr real zero = 0.0; - constexpr real half = 0.5; - constexpr real one = 1.0; - constexpr real two = 2.0; - constexpr real six = 6.0; + constexpr real oneTwelfth = 1.0_real / 12.0_real; + constexpr real oneSixth = 1.0_real / 6.0_real; + constexpr real zero = 0.0_real; + constexpr real half = 0.5_real; + constexpr real one = 1.0_real; + constexpr real two = 2.0_real; + constexpr real six = 6.0_real; // Extract pair list data const int nri = nlist.nri; @@ -243,47 +278,56 @@ static void nb_free_energy_kernel(const t_nblist& nlist, gmx::ArrayRef shift = nlist.shift; gmx::ArrayRef gid = nlist.gid; - const real lambda_coul = lambda[static_cast(FreeEnergyPerturbationCouplingType::Coul)]; - const real lambda_vdw = lambda[static_cast(FreeEnergyPerturbationCouplingType::Vdw)]; - const auto& scParams = *ic.softCoreParameters; - const real alpha_coul = scParams.alphaCoulomb; - const real alpha_vdw = scParams.alphaVdw; - const real lam_power = scParams.lambdaPower; - const real sigma6_def = scParams.sigma6WithInvalidSigma; - const real sigma6_min = scParams.sigma6Minimum; - const bool doForces = ((flags & GMX_NONBONDED_DO_FORCE) != 0); - const bool doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0); - const bool doPotential = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0); + const real lambda_coul = lambda[static_cast(FreeEnergyPerturbationCouplingType::Coul)]; + const real lambda_vdw = lambda[static_cast(FreeEnergyPerturbationCouplingType::Vdw)]; + const auto& scParams = *ic.softCoreParameters; + const real gmx_unused alpha_coul = scParams.alphaCoulomb; + const real gmx_unused alpha_vdw = scParams.alphaVdw; + const real lam_power = scParams.lambdaPower; + const real gmx_unused sigma6_def = scParams.sigma6WithInvalidSigma; + const real gmx_unused sigma6_min = scParams.sigma6Minimum; + const bool doForces = ((flags & GMX_NONBONDED_DO_FORCE) != 0); + const bool doShiftForces = ((flags & GMX_NONBONDED_DO_SHIFTFORCE) != 0); + const bool doPotential = ((flags & GMX_NONBONDED_DO_POTENTIAL) != 0); // Extract data from interaction_const_t - const real facel = ic.epsfac; - const real rCoulomb = ic.rcoulomb; - const real krf = ic.reactionFieldCoefficient; - const real crf = ic.reactionFieldShift; - const real shLjEwald = ic.sh_lj_ewald; - const real rVdw = ic.rvdw; - const real dispersionShift = ic.dispersion_shift.cpot; - const real repulsionShift = ic.repulsion_shift.cpot; + const real facel = ic.epsfac; + const real rCoulomb = ic.rcoulomb; + const real krf = ic.reactionFieldCoefficient; + const real crf = ic.reactionFieldShift; + const real gmx_unused shLjEwald = ic.sh_lj_ewald; + const real rVdw = ic.rvdw; + const real dispersionShift = ic.dispersion_shift.cpot; + const real repulsionShift = ic.repulsion_shift.cpot; + const real ewaldBeta = ic.ewaldcoeff_q; + real gmx_unused ewaldLJCoeffSq; + real gmx_unused ewaldLJCoeffSixDivSix; + if constexpr (vdwInteractionTypeIsEwald) + { + ewaldLJCoeffSq = ic.ewaldcoeff_lj * ic.ewaldcoeff_lj; + ewaldLJCoeffSixDivSix = ewaldLJCoeffSq * ewaldLJCoeffSq * ewaldLJCoeffSq / six; + } // Note that the nbnxm kernels do not support Coulomb potential switching at all GMX_ASSERT(ic.coulomb_modifier != InteractionModifiers::PotSwitch, "Potential switching is not supported for Coulomb with FEP"); - real vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4; - if (vdwModifierIsPotSwitch) + const real rVdwSwitch = ic.rvdw_switch; + real gmx_unused vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4; + if constexpr (vdwModifierIsPotSwitch) { - const real d = ic.rvdw - ic.rvdw_switch; - vdw_swV3 = -10.0 / (d * d * d); - vdw_swV4 = 15.0 / (d * d * d * d); - vdw_swV5 = -6.0 / (d * d * d * d * d); - vdw_swF2 = -30.0 / (d * d * d); - vdw_swF3 = 60.0 / (d * d * d * d); - vdw_swF4 = -30.0 / (d * d * d * d * d); + const real d = rVdw - rVdwSwitch; + vdw_swV3 = -10.0_real / (d * d * d); + vdw_swV4 = 15.0_real / (d * d * d * d); + vdw_swV5 = -6.0_real / (d * d * d * d * d); + vdw_swF2 = -30.0_real / (d * d * d); + vdw_swF3 = 60.0_real / (d * d * d * d); + vdw_swF4 = -30.0_real / (d * d * d * d * d); } else { /* Avoid warnings from stupid compilers (looking at you, Clang!) */ - vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = 0.0; + vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = zero; } NbkernelElecType icoul; @@ -299,33 +343,11 @@ static void nb_free_energy_kernel(const t_nblist& nlist, real rcutoff_max2 = std::max(ic.rcoulomb, ic.rvdw); rcutoff_max2 = rcutoff_max2 * rcutoff_max2; - const real* tab_ewald_F_lj = nullptr; - const real* tab_ewald_V_lj = nullptr; - const real* ewtab = nullptr; - real coulombTableScale = 0; - real coulombTableScaleInvHalf = 0; - real vdwTableScale = 0; - real vdwTableScaleInvHalf = 0; - real sh_ewald = 0; - if (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald) + real gmx_unused sh_ewald = 0; + if constexpr (elecInteractionTypeIsEwald || vdwInteractionTypeIsEwald) { sh_ewald = ic.sh_ewald; } - if (elecInteractionTypeIsEwald) - { - const auto& coulombTables = *ic.coulombEwaldTables; - ewtab = coulombTables.tableFDV0.data(); - coulombTableScale = coulombTables.scale; - coulombTableScaleInvHalf = half / coulombTableScale; - } - if (vdwInteractionTypeIsEwald) - { - const auto& vdwTables = *ic.vdwEwaldTables; - tab_ewald_F_lj = vdwTables.tableF.data(); - tab_ewald_V_lj = vdwTables.tableV.data(); - vdwTableScale = vdwTables.scale; - vdwTableScaleInvHalf = half / vdwTableScale; - } /* For Ewald/PME interactions we cannot easily apply the soft-core component to * reciprocal space. When we use non-switched Ewald interactions, we @@ -342,8 +364,8 @@ static void nb_free_energy_kernel(const t_nblist& nlist, GMX_RELEASE_ASSERT(!(vdwInteractionTypeIsEwald && vdwModifierIsPotSwitch), "Can not apply soft-core to switched Ewald potentials"); - real dvdlCoul = 0; - real dvdlVdw = 0; + RealType dvdlCoul(zero); + RealType dvdlVdw(zero); /* Lambda factor for state A, 1-lambda*/ real LFC[NSTATES], LFV[NSTATES]; @@ -356,11 +378,11 @@ static void nb_free_energy_kernel(const t_nblist& nlist, /*derivative of the lambda factor for state A and B */ real DLF[NSTATES]; - DLF[STATE_A] = -1; - DLF[STATE_B] = 1; + DLF[STATE_A] = -one; + DLF[STATE_B] = one; - real lFacCoul[NSTATES], dlFacCoul[NSTATES], lFacVdw[NSTATES], dlFacVdw[NSTATES]; - constexpr real sc_r_power = 6.0_real; + real gmx_unused lFacCoul[NSTATES], dlFacCoul[NSTATES], lFacVdw[NSTATES], dlFacVdw[NSTATES]; + constexpr real sc_r_power = six; for (int i = 0; i < NSTATES; i++) { lFacCoul[i] = (lam_power == 2 ? (1 - LFC[i]) * (1 - LFC[i]) : (1 - LFC[i])); @@ -370,57 +392,172 @@ static void nb_free_energy_kernel(const t_nblist& nlist, } // TODO: We should get rid of using pointers to real - const real* x = coords[0]; - real* gmx_restrict f = &(forceWithShiftForces->force()[0][0]); - real* gmx_restrict fshift = &(forceWithShiftForces->shiftForces()[0][0]); + const real* gmx_restrict x = coords.paddedConstArrayRef().data()[0]; const real rlistSquared = gmx::square(rlist); - int numExcludedPairsBeyondRlist = 0; + bool haveExcludedPairsBeyondRlist = false; for (int n = 0; n < nri; n++) { - int npair_within_cutoff = 0; - - const int is = shift[n]; - const int is3 = DIM * is; - const real shX = shiftvec[is][XX]; - const real shY = shiftvec[is][YY]; - const real shZ = shiftvec[is][ZZ]; - const int nj0 = jindex[n]; - const int nj1 = jindex[n + 1]; - const int ii = iinr[n]; - const int ii3 = 3 * ii; - const real ix = shX + x[ii3 + 0]; - const real iy = shY + x[ii3 + 1]; - const real iz = shZ + x[ii3 + 2]; - const real iqA = facel * chargeA[ii]; - const real iqB = facel * chargeB[ii]; - const int ntiA = 2 * ntype * typeA[ii]; - const int ntiB = 2 * ntype * typeB[ii]; - real vCTot = 0; - real vVTot = 0; - real fIX = 0; - real fIY = 0; - real fIZ = 0; - - for (int k = nj0; k < nj1; k++) + bool havePairsWithinCutoff = false; + + const int is = shift[n]; + const real shX = shiftvec[is][XX]; + const real shY = shiftvec[is][YY]; + const real shZ = shiftvec[is][ZZ]; + const int nj0 = jindex[n]; + const int nj1 = jindex[n + 1]; + const int ii = iinr[n]; + const int ii3 = 3 * ii; + const real ix = shX + x[ii3 + 0]; + const real iy = shY + x[ii3 + 1]; + const real iz = shZ + x[ii3 + 2]; + const real iqA = facel * chargeA[ii]; + const real iqB = facel * chargeB[ii]; + const int ntiA = ntype * typeA[ii]; + const int ntiB = ntype * typeB[ii]; + RealType vCTot(0); + RealType vVTot(0); + RealType fIX(0); + RealType fIY(0); + RealType fIZ(0); + +#if GMX_SIMD_HAVE_REAL + alignas(GMX_SIMD_ALIGNMENT) int preloadIi[DataTypes::simdRealWidth]; + alignas(GMX_SIMD_ALIGNMENT) int preloadIs[DataTypes::simdRealWidth]; +#else + int preloadIi[DataTypes::simdRealWidth]; + int preloadIs[DataTypes::simdRealWidth]; +#endif + for (int s = 0; s < DataTypes::simdRealWidth; s++) { - int tj[NSTATES]; - const int jnr = jjnr[k]; - const int j3 = 3 * jnr; - RealType c6[NSTATES], c12[NSTATES], qq[NSTATES], vCoul[NSTATES], vVdw[NSTATES]; - RealType r, rInv, rp, rpm2; - RealType alphaVdwEff, alphaCoulEff, sigma6[NSTATES]; - const RealType dX = ix - x[j3]; - const RealType dY = iy - x[j3 + 1]; - const RealType dZ = iz - x[j3 + 2]; + preloadIi[s] = ii; + preloadIs[s] = shift[n]; + } + IntType ii_s = gmx::load(preloadIi); + + for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth) + { + RealType r, rInv; + +#if GMX_SIMD_HAVE_REAL + alignas(GMX_SIMD_ALIGNMENT) real preloadPairIsValid[DataTypes::simdRealWidth]; + alignas(GMX_SIMD_ALIGNMENT) real preloadPairIncluded[DataTypes::simdRealWidth]; + alignas(GMX_SIMD_ALIGNMENT) int32_t preloadJnr[DataTypes::simdRealWidth]; + alignas(GMX_SIMD_ALIGNMENT) int32_t typeIndices[NSTATES][DataTypes::simdRealWidth]; + alignas(GMX_SIMD_ALIGNMENT) real preloadQq[NSTATES][DataTypes::simdRealWidth]; + alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth]; + alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth]; + alignas(GMX_SIMD_ALIGNMENT) real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth]; + alignas(GMX_SIMD_ALIGNMENT) real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth]; +#else + real preloadPairIsValid[DataTypes::simdRealWidth]; + real preloadPairIncluded[DataTypes::simdRealWidth]; + int preloadJnr[DataTypes::simdRealWidth]; + int typeIndices[NSTATES][DataTypes::simdRealWidth]; + real preloadQq[NSTATES][DataTypes::simdRealWidth]; + real gmx_unused preloadSigma6[NSTATES][DataTypes::simdRealWidth]; + real gmx_unused preloadAlphaVdwEff[DataTypes::simdRealWidth]; + real gmx_unused preloadAlphaCoulEff[DataTypes::simdRealWidth]; + real preloadLjPmeC6Grid[NSTATES][DataTypes::simdRealWidth]; +#endif + for (int s = 0; s < DataTypes::simdRealWidth; s++) + { + if (k + s < nj1) + { + preloadPairIsValid[s] = true; + /* Check if this pair on the exclusions list.*/ + preloadPairIncluded[s] = (nlist.excl_fep.empty() || nlist.excl_fep[k + s]); + const int jnr = jjnr[k + s]; + preloadJnr[s] = jnr; + typeIndices[STATE_A][s] = ntiA + typeA[jnr]; + typeIndices[STATE_B][s] = ntiB + typeB[jnr]; + preloadQq[STATE_A][s] = iqA * chargeA[jnr]; + preloadQq[STATE_B][s] = iqB * chargeB[jnr]; + + for (int i = 0; i < NSTATES; i++) + { + if constexpr (vdwInteractionTypeIsEwald) + { + preloadLjPmeC6Grid[i][s] = nbfp_grid[2 * typeIndices[i][s]]; + } + else + { + preloadLjPmeC6Grid[i][s] = 0; + } + if constexpr (useSoftCore) + { + const real c6 = nbfp[2 * typeIndices[i][s]]; + const real c12 = nbfp[2 * typeIndices[i][s] + 1]; + if (c6 > 0 && c12 > 0) + { + /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */ + preloadSigma6[i][s] = 0.5_real * c12 / c6; + if (preloadSigma6[i][s] + < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */ + { + preloadSigma6[i][s] = sigma6_min; + } + } + else + { + preloadSigma6[i][s] = sigma6_def; + } + } + } + if constexpr (useSoftCore) + { + /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/ + const real c12A = nbfp[2 * typeIndices[STATE_A][s] + 1]; + const real c12B = nbfp[2 * typeIndices[STATE_B][s] + 1]; + if (c12A > 0 && c12B > 0) + { + preloadAlphaVdwEff[s] = 0; + preloadAlphaCoulEff[s] = 0; + } + else + { + preloadAlphaVdwEff[s] = alpha_vdw; + preloadAlphaCoulEff[s] = alpha_coul; + } + } + } + else + { + preloadJnr[s] = jjnr[k]; + preloadPairIsValid[s] = false; + preloadPairIncluded[s] = false; + preloadAlphaVdwEff[s] = 0; + preloadAlphaCoulEff[s] = 0; + + for (int i = 0; i < NSTATES; i++) + { + typeIndices[STATE_A][s] = ntiA + typeA[jjnr[k]]; + typeIndices[STATE_B][s] = ntiB + typeB[jjnr[k]]; + preloadLjPmeC6Grid[i][s] = 0; + preloadQq[i][s] = 0; + preloadSigma6[i][s] = 0; + } + } + } + + RealType jx, jy, jz; + gmx::gatherLoadUTranspose<3>(reinterpret_cast(x), preloadJnr, &jx, &jy, &jz); + + const RealType pairIsValid = gmx::load(preloadPairIsValid); + const RealType pairIncluded = gmx::load(preloadPairIncluded); + const BoolType bPairIncluded = (pairIncluded != zero); + const BoolType bPairExcluded = (pairIncluded == zero && pairIsValid != zero); + + const RealType dX = ix - jx; + const RealType dY = iy - jy; + const RealType dZ = iz - jz; const RealType rSq = dX * dX + dY * dY + dZ * dZ; - RealType fScalC[NSTATES], fScalV[NSTATES]; - /* Check if this pair on the exlusions list.*/ - const bool bPairIncluded = nlist.excl_fep.empty() || nlist.excl_fep[k]; - if (rSq >= rcutoff_max2 && bPairIncluded) + BoolType withinCutoffMask = (rSq < rcutoff_max2); + + if (!gmx::anyTrue(withinCutoffMask || bPairExcluded)) { /* We save significant time by skipping all code below. * Note that with soft-core interactions, the actual cut-off @@ -430,38 +567,55 @@ static void nb_free_energy_kernel(const t_nblist& nlist, * when using Ewald: the reciprocal-space * Ewald component still needs to be subtracted. */ - continue; } - npair_within_cutoff++; - - if (rSq > rlistSquared) + else { - numExcludedPairsBeyondRlist++; + havePairsWithinCutoff = true; } - if (rSq > 0) + if (gmx::anyTrue(rlistSquared < rSq && bPairExcluded)) { - /* Note that unlike in the nbnxn kernels, we do not need - * to clamp the value of rSq before taking the invsqrt - * to avoid NaN in the LJ calculation, since here we do - * not calculate LJ interactions when C6 and C12 are zero. - */ + haveExcludedPairsBeyondRlist = true; + } - rInv = gmx::invsqrt(rSq); - r = rSq * rInv; + const IntType jnr_s = gmx::load(preloadJnr); + const BoolType bIiEqJnr = gmx::cvtIB2B(ii_s == jnr_s); + + RealType c6[NSTATES]; + RealType c12[NSTATES]; + RealType gmx_unused sigma6[NSTATES]; + RealType qq[NSTATES]; + RealType gmx_unused ljPmeC6Grid[NSTATES]; + RealType gmx_unused alphaVdwEff; + RealType gmx_unused alphaCoulEff; + for (int i = 0; i < NSTATES; i++) + { + gmx::gatherLoadTranspose<2>(nbfp.data(), typeIndices[i], &c6[i], &c12[i]); + qq[i] = gmx::load(preloadQq[i]); + ljPmeC6Grid[i] = gmx::load(preloadLjPmeC6Grid[i]); + if constexpr (useSoftCore) + { + sigma6[i] = gmx::load(preloadSigma6[i]); + } } - else + if constexpr (useSoftCore) { - /* The force at r=0 is zero, because of symmetry. - * But note that the potential is in general non-zero, - * since the soft-cored r will be non-zero. - */ - rInv = 0; - r = 0; + alphaVdwEff = gmx::load(preloadAlphaVdwEff); + alphaCoulEff = gmx::load(preloadAlphaCoulEff); } - if (useSoftCore) + BoolType rSqValid = (zero < rSq); + + /* The force at r=0 is zero, because of symmetry. + * But note that the potential is in general non-zero, + * since the soft-cored r will be non-zero. + */ + rInv = gmx::maskzInvsqrt(rSq, rSqValid); + r = rSq * rInv; + + RealType gmx_unused rp, rpm2; + if constexpr (useSoftCore) { rpm2 = rSq * rSq; /* r4 */ rp = rpm2 * rSq; /* r6 */ @@ -473,79 +627,45 @@ static void nb_free_energy_kernel(const t_nblist& nlist, * the simplest math and cheapest code. */ rpm2 = rInv * rInv; - rp = 1; + rp = one; } - RealType fScal = 0; + RealType fScal(0); - qq[STATE_A] = iqA * chargeA[jnr]; - qq[STATE_B] = iqB * chargeB[jnr]; - - tj[STATE_A] = ntiA + 2 * typeA[jnr]; - tj[STATE_B] = ntiB + 2 * typeB[jnr]; - - if (bPairIncluded) + /* The following block is masked to only calculate values having bPairIncluded. If + * bPairIncluded is true then withinCutoffMask must also be true. */ + if (gmx::anyTrue(withinCutoffMask && bPairIncluded)) { - c6[STATE_A] = nbfp[tj[STATE_A]]; - c6[STATE_B] = nbfp[tj[STATE_B]]; - + RealType fScalC[NSTATES], fScalV[NSTATES]; + RealType vCoul[NSTATES], vVdw[NSTATES]; for (int i = 0; i < NSTATES; i++) { - c12[i] = nbfp[tj[i] + 1]; - if (useSoftCore) + fScalC[i] = zero; + fScalV[i] = zero; + vCoul[i] = zero; + vVdw[i] = zero; + + RealType gmx_unused rInvC, rInvV, rC, rV, rPInvC, rPInvV; + + /* The following block is masked to require (qq[i] != 0 || c6[i] != 0 || c12[i] + * != 0) in addition to bPairIncluded, which in turn requires withinCutoffMask. */ + BoolType nonZeroState = ((qq[i] != zero || c6[i] != zero || c12[i] != zero) + && bPairIncluded && withinCutoffMask); + if (gmx::anyTrue(nonZeroState)) { - if ((c6[i] > 0) && (c12[i] > 0)) - { - /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */ - sigma6[i] = half * c12[i] / c6[i]; - if (sigma6[i] < sigma6_min) /* for disappearing coul and vdw with soft core at the same time */ - { - sigma6[i] = sigma6_min; - } - } - else + if constexpr (useSoftCore) { - sigma6[i] = sigma6_def; - } - } - } - - if (useSoftCore) - { - /* only use softcore if one of the states has a zero endstate - softcore is for avoiding infinities!*/ - if ((c12[STATE_A] > 0) && (c12[STATE_B] > 0)) - { - alphaVdwEff = 0; - alphaCoulEff = 0; - } - else - { - alphaVdwEff = alpha_vdw; - alphaCoulEff = alpha_coul; - } - } - - for (int i = 0; i < NSTATES; i++) - { - fScalC[i] = 0; - fScalV[i] = 0; - vCoul[i] = 0; - vVdw[i] = 0; - - RealType rInvC, rInvV, rC, rV, rPInvC, rPInvV; + RealType divisor = (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp); + BoolType validDivisor = (zero < divisor); + rPInvC = gmx::maskzInv(divisor, validDivisor); + pthRoot(rPInvC, &rInvC, &rC, validDivisor); - /* Only spend time on A or B state if it is non-zero */ - if ((qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0)) - { - /* this section has to be inside the loop because of the dependence on sigma6 */ - if (useSoftCore) - { - rPInvC = one / (alphaCoulEff * lFacCoul[i] * sigma6[i] + rp); - pthRoot(rPInvC, &rInvC, &rC); - if (scLambdasOrAlphasDiffer) + if constexpr (scLambdasOrAlphasDiffer) { - rPInvV = one / (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp); - pthRoot(rPInvV, &rInvV, &rV); + RealType divisor = (alphaVdwEff * lFacVdw[i] * sigma6[i] + rp); + BoolType validDivisor = (zero < divisor); + rPInvV = gmx::maskzInv(divisor, validDivisor); + pthRoot(rPInvV, &rInvV, &rV, validDivisor); } else { @@ -557,25 +677,31 @@ static void nb_free_energy_kernel(const t_nblist& nlist, } else { - rPInvC = 1; + rPInvC = one; rInvC = rInv; rC = r; - rPInvV = 1; + rPInvV = one; rInvV = rInv; rV = r; } - /* Only process the coulomb interactions if we have charges, - * and if we either include all entries in the list (no cutoff + /* Only process the coulomb interactions if we either + * include all entries in the list (no cutoff * used in the kernel), or if we are within the cutoff. */ - bool computeElecInteraction = (elecInteractionTypeIsEwald && r < rCoulomb) - || (!elecInteractionTypeIsEwald && rC < rCoulomb); - - if ((qq[i] != 0) && computeElecInteraction) + BoolType computeElecInteraction; + if constexpr (elecInteractionTypeIsEwald) { - if (elecInteractionTypeIsEwald) + computeElecInteraction = (r < rCoulomb && qq[i] != zero && bPairIncluded); + } + else + { + computeElecInteraction = (rC < rCoulomb && qq[i] != zero && bPairIncluded); + } + if (gmx::anyTrue(computeElecInteraction)) + { + if constexpr (elecInteractionTypeIsEwald) { vCoul[i] = ewaldPotential(qq[i], rInvC, sh_ewald); fScalC[i] = ewaldScalarForce(qq[i], rInvC); @@ -585,19 +711,30 @@ static void nb_free_energy_kernel(const t_nblist& nlist, vCoul[i] = reactionFieldPotential(qq[i], rInvC, rC, krf, crf); fScalC[i] = reactionFieldScalarForce(qq[i], rInvC, rC, krf, two); } + + vCoul[i] = gmx::selectByMask(vCoul[i], computeElecInteraction); + fScalC[i] = gmx::selectByMask(fScalC[i], computeElecInteraction); } - /* Only process the VDW interactions if we have - * some non-zero parameters, and if we either + /* Only process the VDW interactions if we either * include all entries in the list (no cutoff used * in the kernel), or if we are within the cutoff. */ - bool computeVdwInteraction = (vdwInteractionTypeIsEwald && r < rVdw) - || (!vdwInteractionTypeIsEwald && rV < rVdw); - if ((c6[i] != 0 || c12[i] != 0) && computeVdwInteraction) + BoolType computeVdwInteraction; + if constexpr (vdwInteractionTypeIsEwald) + { + computeVdwInteraction = + (r < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded); + } + else + { + computeVdwInteraction = + (rV < rVdw && (c6[i] != 0 || c12[i] != 0) && bPairIncluded); + } + if (gmx::anyTrue(computeVdwInteraction)) { RealType rInv6; - if (useSoftCore) + if constexpr (useSoftCore) { rInv6 = rPInvV; } @@ -612,26 +749,33 @@ static void nb_free_energy_kernel(const t_nblist& nlist, vVdw6, vVdw12, c6[i], c12[i], repulsionShift, dispersionShift, oneSixth, oneTwelfth); fScalV[i] = lennardJonesScalarForce(vVdw6, vVdw12); - if (vdwInteractionTypeIsEwald) + if constexpr (vdwInteractionTypeIsEwald) { /* Subtract the grid potential at the cut-off */ - vVdw[i] += ewaldLennardJonesGridSubtract( - nbfp_grid[tj[i]], shLjEwald, oneSixth); + vVdw[i] = vVdw[i] + + gmx::selectByMask(ewaldLennardJonesGridSubtract( + ljPmeC6Grid[i], shLjEwald, oneSixth), + computeVdwInteraction); } - if (vdwModifierIsPotSwitch) + if constexpr (vdwModifierIsPotSwitch) { - RealType d = rV - ic.rvdw_switch; - d = (d > zero) ? d : zero; - const RealType d2 = d * d; + RealType d = rV - rVdwSwitch; + BoolType zeroMask = zero < d; + BoolType potSwitchMask = rV < rVdw; + d = gmx::selectByMask(d, zeroMask); + const RealType d2 = d * d; const RealType sw = one + d2 * d * (vdw_swV3 + d * (vdw_swV4 + d * vdw_swV5)); const RealType dsw = d2 * (vdw_swF2 + d * (vdw_swF3 + d * vdw_swF4)); fScalV[i] = potSwitchScalarForceMod( - fScalV[i], vVdw[i], sw, rV, rVdw, dsw, zero); - vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, rV, rVdw, zero); + fScalV[i], vVdw[i], sw, rV, dsw, potSwitchMask); + vVdw[i] = potSwitchPotentialMod(vVdw[i], sw, potSwitchMask); } + + vVdw[i] = gmx::selectByMask(vVdw[i], computeVdwInteraction); + fScalV[i] = gmx::selectByMask(fScalV[i], computeVdwInteraction); } /* fScalC (and fScalV) now contain: dV/drC * rC @@ -639,57 +783,69 @@ static void nb_free_energy_kernel(const t_nblist& nlist, * Further down we first multiply by r^p-2 and then by * the vector r, which in total gives: dV/drC * (r/rC)^1-p */ - fScalC[i] *= rPInvC; - fScalV[i] *= rPInvV; - } - } // end for (int i = 0; i < NSTATES; i++) - - /* Assemble A and B states */ - for (int i = 0; i < NSTATES; i++) + fScalC[i] = fScalC[i] * rPInvC; + fScalV[i] = fScalV[i] * rPInvV; + } // end of block requiring nonZeroState + } // end for (int i = 0; i < NSTATES; i++) + + /* Assemble A and B states. */ + BoolType assembleStates = (bPairIncluded && withinCutoffMask); + if (gmx::anyTrue(assembleStates)) { - vCTot += LFC[i] * vCoul[i]; - vVTot += LFV[i] * vVdw[i]; + for (int i = 0; i < NSTATES; i++) + { + vCTot = vCTot + LFC[i] * vCoul[i]; + vVTot = vVTot + LFV[i] * vVdw[i]; - fScal += LFC[i] * fScalC[i] * rpm2; - fScal += LFV[i] * fScalV[i] * rpm2; + fScal = fScal + LFC[i] * fScalC[i] * rpm2; + fScal = fScal + LFV[i] * fScalV[i] * rpm2; - if (useSoftCore) - { - dvdlCoul += vCoul[i] * DLF[i] - + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i]; - dvdlVdw += vVdw[i] * DLF[i] - + LFV[i] * alphaVdwEff * dlFacVdw[i] * fScalV[i] * sigma6[i]; - } - else - { - dvdlCoul += vCoul[i] * DLF[i]; - dvdlVdw += vVdw[i] * DLF[i]; + if constexpr (useSoftCore) + { + dvdlCoul = dvdlCoul + vCoul[i] * DLF[i] + + LFC[i] * alphaCoulEff * dlFacCoul[i] * fScalC[i] * sigma6[i]; + dvdlVdw = dvdlVdw + vVdw[i] * DLF[i] + + LFV[i] * alphaVdwEff * dlFacVdw[i] * fScalV[i] * sigma6[i]; + } + else + { + dvdlCoul = dvdlCoul + vCoul[i] * DLF[i]; + dvdlVdw = dvdlVdw + vVdw[i] * DLF[i]; + } } } - } // end if (bPairIncluded) - else if (icoul == NbkernelElecType::ReactionField) + } // end of block requiring bPairIncluded && withinCutoffMask + /* In the following block bPairIncluded should be false in the masks. */ + if (icoul == NbkernelElecType::ReactionField) { - /* For excluded pairs, which are only in this pair list when - * using the Verlet scheme, we don't use soft-core. - * As there is no singularity, there is no need for soft-core. - */ - const real FF = -two * krf; - RealType VV = krf * rSq - crf; + const BoolType computeReactionField = bPairExcluded; - if (ii == jnr) + if (gmx::anyTrue(computeReactionField)) { - VV *= half; - } + /* For excluded pairs we don't use soft-core. + * As there is no singularity, there is no need for soft-core. + */ + const RealType FF = -two * krf; + RealType VV = krf * rSq - crf; - for (int i = 0; i < NSTATES; i++) - { - vCTot += LFC[i] * qq[i] * VV; - fScal += LFC[i] * qq[i] * FF; - dvdlCoul += DLF[i] * qq[i] * VV; + /* If ii == jnr the i particle (ii) has itself (jnr) + * in its neighborlist. This corresponds to a self-interaction + * that will occur twice. Scale it down by 50% to only include + * it once. + */ + VV = VV * gmx::blend(one, half, bIiEqJnr); + + for (int i = 0; i < NSTATES; i++) + { + vCTot = vCTot + gmx::selectByMask(LFC[i] * qq[i] * VV, computeReactionField); + fScal = fScal + gmx::selectByMask(LFC[i] * qq[i] * FF, computeReactionField); + dvdlCoul = dvdlCoul + gmx::selectByMask(DLF[i] * qq[i] * VV, computeReactionField); + } } } - if (elecInteractionTypeIsEwald && (r < rCoulomb || !bPairIncluded)) + const BoolType computeElecEwaldInteraction = (bPairExcluded || r < rCoulomb); + if (elecInteractionTypeIsEwald && gmx::anyTrue(computeElecEwaldInteraction)) { /* See comment in the preamble. When using Ewald interactions * (unless we use a switch modifier) we subtract the reciprocal-space @@ -699,39 +855,33 @@ static void nb_free_energy_kernel(const t_nblist& nlist, * the softcore to the entire electrostatic interaction, * including the reciprocal-space component. */ - real v_lr, f_lr; + RealType v_lr, f_lr; - const RealType ewrt = r * coulombTableScale; - IntType ewitab = static_cast(ewrt); - const RealType eweps = ewrt - ewitab; - ewitab = 4 * ewitab; - f_lr = ewtab[ewitab] + eweps * ewtab[ewitab + 1]; - v_lr = (ewtab[ewitab + 2] - coulombTableScaleInvHalf * eweps * (ewtab[ewitab] + f_lr)); - f_lr *= rInv; + pmeCoulombCorrectionVF(rSq, ewaldBeta, &v_lr, &f_lr, rSqValid); + f_lr = f_lr * rInv * rInv; /* Note that any possible Ewald shift has already been applied in * the normal interaction part above. */ - if (ii == jnr) - { - /* If we get here, the i particle (ii) has itself (jnr) - * in its neighborlist. This can only happen with the Verlet - * scheme, and corresponds to a self-interaction that will - * occur twice. Scale it down by 50% to only include it once. - */ - v_lr *= half; - } + /* If ii == jnr the i particle (ii) has itself (jnr) + * in its neighborlist. This corresponds to a self-interaction + * that will occur twice. Scale it down by 50% to only include + * it once. + */ + v_lr = v_lr * gmx::blend(one, half, bIiEqJnr); for (int i = 0; i < NSTATES; i++) { - vCTot -= LFC[i] * qq[i] * v_lr; - fScal -= LFC[i] * qq[i] * f_lr; - dvdlCoul -= (DLF[i] * qq[i]) * v_lr; + vCTot = vCTot - gmx::selectByMask(LFC[i] * qq[i] * v_lr, computeElecEwaldInteraction); + fScal = fScal - gmx::selectByMask(LFC[i] * qq[i] * f_lr, computeElecEwaldInteraction); + dvdlCoul = dvdlCoul + - gmx::selectByMask(DLF[i] * qq[i] * v_lr, computeElecEwaldInteraction); } } - if (vdwInteractionTypeIsEwald && (r < rVdw || !bPairIncluded)) + const BoolType computeVdwEwaldInteraction = (bPairExcluded || r < rVdw); + if (vdwInteractionTypeIsEwald && gmx::anyTrue(computeVdwEwaldInteraction)) { /* See comment in the preamble. When using LJ-Ewald interactions * (unless we use a switch modifier) we subtract the reciprocal-space @@ -741,147 +891,105 @@ static void nb_free_energy_kernel(const t_nblist& nlist, * the softcore to the entire VdW interaction, * including the reciprocal-space component. */ - /* We could also use the analytical form here - * iso a table, but that can cause issues for - * r close to 0 for non-interacting pairs. - */ - const RealType rs = rSq * rInv * vdwTableScale; - const IntType ri = static_cast(rs); - const RealType frac = rs - ri; - const RealType f_lr = (1 - frac) * tab_ewald_F_lj[ri] + frac * tab_ewald_F_lj[ri + 1]; - /* TODO: Currently the Ewald LJ table does not contain - * the factor 1/6, we should add this. - */ - const RealType FF = f_lr * rInv / six; - RealType VV = - (tab_ewald_V_lj[ri] - vdwTableScaleInvHalf * frac * (tab_ewald_F_lj[ri] + f_lr)) - / six; - - if (ii == jnr) - { - /* If we get here, the i particle (ii) has itself (jnr) - * in its neighborlist. This can only happen with the Verlet - * scheme, and corresponds to a self-interaction that will - * occur twice. Scale it down by 50% to only include it once. - */ - VV *= half; - } + RealType v_lr, f_lr; + pmeLJCorrectionVF( + rInv, rSq, ewaldLJCoeffSq, ewaldLJCoeffSixDivSix, &v_lr, &f_lr, computeVdwEwaldInteraction, bIiEqJnr); + v_lr = v_lr * oneSixth; for (int i = 0; i < NSTATES; i++) { - const real c6grid = nbfp_grid[tj[i]]; - vVTot += LFV[i] * c6grid * VV; - fScal += LFV[i] * c6grid * FF; - dvdlVdw += (DLF[i] * c6grid) * VV; + vVTot = vVTot + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction); + fScal = fScal + gmx::selectByMask(LFV[i] * ljPmeC6Grid[i] * f_lr, computeVdwEwaldInteraction); + dvdlVdw = dvdlVdw + gmx::selectByMask(DLF[i] * ljPmeC6Grid[i] * v_lr, computeVdwEwaldInteraction); } } - if (doForces) + if (doForces && gmx::anyTrue(fScal != zero)) { - const real tX = fScal * dX; - const real tY = fScal * dY; - const real tZ = fScal * dZ; - fIX = fIX + tX; - fIY = fIY + tY; - fIZ = fIZ + tZ; - /* OpenMP atomics are expensive, but this kernels is also - * expensive, so we can take this hit, instead of using - * thread-local output buffers and extra reduction. - * - * All the OpenMP regions in this file are trivial and should - * not throw, so no need for try/catch. - */ -#pragma omp atomic - f[j3] -= tX; -#pragma omp atomic - f[j3 + 1] -= tY; -#pragma omp atomic - f[j3 + 2] -= tZ; + const RealType tX = fScal * dX; + const RealType tY = fScal * dY; + const RealType tZ = fScal * dZ; + fIX = fIX + tX; + fIY = fIY + tY; + fIZ = fIZ + tZ; + + gmx::transposeScatterDecrU<3>( + reinterpret_cast(threadForceBuffer), preloadJnr, tX, tY, tZ); } - } // end for (int k = nj0; k < nj1; k++) - - /* The atomics below are expensive with many OpenMP threads. - * Here unperturbed i-particles will usually only have a few - * (perturbed) j-particles in the list. Thus with a buffered list - * we can skip a significant number of i-reductions with a check. - */ - if (npair_within_cutoff > 0) + } // end for (int k = nj0; k < nj1; k += DataTypes::simdRealWidth) + + if (havePairsWithinCutoff) { if (doForces) { -#pragma omp atomic - f[ii3] += fIX; -#pragma omp atomic - f[ii3 + 1] += fIY; -#pragma omp atomic - f[ii3 + 2] += fIZ; + gmx::transposeScatterIncrU<3>( + reinterpret_cast(threadForceBuffer), preloadIi, fIX, fIY, fIZ); } if (doShiftForces) { -#pragma omp atomic - fshift[is3] += fIX; -#pragma omp atomic - fshift[is3 + 1] += fIY; -#pragma omp atomic - fshift[is3 + 2] += fIZ; + gmx::transposeScatterIncrU<3>( + reinterpret_cast(threadForceShiftBuffer), preloadIs, fIX, fIY, fIZ); } if (doPotential) { int ggid = gid[n]; -#pragma omp atomic - energygrp_elec[ggid] += vCTot; -#pragma omp atomic - energygrp_vdw[ggid] += vVTot; + threadVc[ggid] += gmx::reduce(vCTot); + threadVv[ggid] += gmx::reduce(vVTot); } } } // end for (int n = 0; n < nri; n++) -#pragma omp atomic - dvdl[static_cast(FreeEnergyPerturbationCouplingType::Coul)] += dvdlCoul; -#pragma omp atomic - dvdl[static_cast(FreeEnergyPerturbationCouplingType::Vdw)] += dvdlVdw; + if (gmx::anyTrue(dvdlCoul != zero)) + { + threadDvdl[static_cast(FreeEnergyPerturbationCouplingType::Coul)] += gmx::reduce(dvdlCoul); + } + if (gmx::anyTrue(dvdlVdw != zero)) + { + threadDvdl[static_cast(FreeEnergyPerturbationCouplingType::Vdw)] += gmx::reduce(dvdlVdw); + } /* Estimate flops, average for free energy stuff: * 12 flops per outer iteration * 150 flops per inner iteration + * TODO: Update the number of flops and/or use different counts for different code paths. */ atomicNrnbIncrement(nrnb, eNR_NBKERNEL_FREE_ENERGY, nlist.nri * 12 + nlist.jindex[nri] * 150); - if (numExcludedPairsBeyondRlist > 0) + if (haveExcludedPairsBeyondRlist > 0) { gmx_fatal(FARGS, - "There are %d perturbed non-bonded pair interactions beyond the pair-list cutoff " + "There are perturbed non-bonded pair interactions beyond the pair-list cutoff " "of %g nm, which is not supported. This can happen because the system is " "unstable or because intra-molecular interactions at long distances are " "excluded. If the " "latter is the case, you can try to increase nstlist or rlist to avoid this." "The error is likely triggered by the use of couple-intramol=no " "and the maximal distance in the decoupled molecule exceeding rlist.", - numExcludedPairsBeyondRlist, rlist); } } -typedef void (*KernelFunction)(const t_nblist& nlist, - gmx::ArrayRef coords, - gmx::ForceWithShiftForces* forceWithShiftForces, - const int ntype, - const real rlist, - const interaction_const_t& ic, - gmx::ArrayRef shiftvec, - gmx::ArrayRef nbfp, - gmx::ArrayRef nbfp_grid, - gmx::ArrayRef chargeA, - gmx::ArrayRef chargeB, - gmx::ArrayRef typeA, - gmx::ArrayRef typeB, - int flags, - gmx::ArrayRef lambda, - gmx::ArrayRef dvdl, - gmx::ArrayRef energygrp_elec, - gmx::ArrayRef energygrp_vdw, - t_nrnb* gmx_restrict nrnb); +typedef void (*KernelFunction)(const t_nblist& nlist, + const gmx::ArrayRefWithPadding& coords, + const int ntype, + const real rlist, + const interaction_const_t& ic, + gmx::ArrayRef shiftvec, + gmx::ArrayRef nbfp, + gmx::ArrayRef nbfp_grid, + gmx::ArrayRef chargeA, + gmx::ArrayRef chargeB, + gmx::ArrayRef typeA, + gmx::ArrayRef typeB, + int flags, + gmx::ArrayRef lambda, + t_nrnb* gmx_restrict nrnb, + gmx::RVec* threadForceBuffer, + rvec* threadForceShiftBuffer, + gmx::ArrayRef threadVc, + gmx::ArrayRef threadVv, + gmx::ArrayRef threadDvdl); template static KernelFunction dispatchKernelOnUseSimd(const bool useSimd) @@ -889,8 +997,7 @@ static KernelFunction dispatchKernelOnUseSimd(const bool useSimd) if (useSimd) { #if GMX_SIMD_HAVE_REAL && GMX_SIMD_HAVE_INT32_ARITHMETICS && GMX_USE_SIMD_KERNELS - /* FIXME: Here SimdDataTypes should be used to enable SIMD. So far, the code in nb_free_energy_kernel is not adapted to SIMD */ - return (nb_free_energy_kernel); + return (nb_free_energy_kernel); #else return (nb_free_energy_kernel); #endif @@ -996,26 +1103,27 @@ static KernelFunction dispatchKernel(const bool scLambdasOrAlpha } -void gmx_nb_free_energy_kernel(const t_nblist& nlist, - gmx::ArrayRef coords, - gmx::ForceWithShiftForces* ff, - const bool useSimd, - const int ntype, - const real rlist, - const interaction_const_t& ic, - gmx::ArrayRef shiftvec, - gmx::ArrayRef nbfp, - gmx::ArrayRef nbfp_grid, - gmx::ArrayRef chargeA, - gmx::ArrayRef chargeB, - gmx::ArrayRef typeA, - gmx::ArrayRef typeB, - int flags, - gmx::ArrayRef lambda, - gmx::ArrayRef dvdl, - gmx::ArrayRef energygrp_elec, - gmx::ArrayRef energygrp_vdw, - t_nrnb* nrnb) +void gmx_nb_free_energy_kernel(const t_nblist& nlist, + const gmx::ArrayRefWithPadding& coords, + const bool useSimd, + const int ntype, + const real rlist, + const interaction_const_t& ic, + gmx::ArrayRef shiftvec, + gmx::ArrayRef nbfp, + gmx::ArrayRef nbfp_grid, + gmx::ArrayRef chargeA, + gmx::ArrayRef chargeB, + gmx::ArrayRef typeA, + gmx::ArrayRef typeB, + int flags, + gmx::ArrayRef lambda, + t_nrnb* nrnb, + gmx::RVec* threadForceBuffer, + rvec* threadForceShiftBuffer, + gmx::ArrayRef threadVc, + gmx::ArrayRef threadVv, + gmx::ArrayRef threadDvdl) { GMX_ASSERT(EEL_PME_EWALD(ic.eeltype) || ic.eeltype == CoulombInteractionType::Cut || EEL_RF(ic.eeltype), "Unsupported eeltype with free energy"); @@ -1050,7 +1158,6 @@ void gmx_nb_free_energy_kernel(const t_nblist& nlist, ic); kernelFunc(nlist, coords, - ff, ntype, rlist, ic, @@ -1063,8 +1170,10 @@ void gmx_nb_free_energy_kernel(const t_nblist& nlist, typeB, flags, lambda, - dvdl, - energygrp_elec, - energygrp_vdw, - nrnb); + nrnb, + threadForceBuffer, + threadForceShiftBuffer, + threadVc, + threadVv, + threadDvdl); } diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.h b/src/gromacs/gmxlib/nonbonded/nb_free_energy.h index 3a04f196cb..d5c3188067 100644 --- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.h +++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.h @@ -47,30 +47,32 @@ struct t_nblist; struct interaction_const_t; namespace gmx { -class ForceWithShiftForces; template class ArrayRef; +template +class ArrayRefWithPadding; } // namespace gmx -void gmx_nb_free_energy_kernel(const t_nblist& nlist, - gmx::ArrayRef coords, - gmx::ForceWithShiftForces* forceWithShiftForces, - bool useSimd, - int ntype, - real rlist, - const interaction_const_t& ic, - gmx::ArrayRef shiftvec, - gmx::ArrayRef nbfp, - gmx::ArrayRef nbfp_grid, - gmx::ArrayRef chargeA, - gmx::ArrayRef chargeB, - gmx::ArrayRef typeA, - gmx::ArrayRef typeB, - int flags, - gmx::ArrayRef lambda, - gmx::ArrayRef dvdl, - gmx::ArrayRef energygrp_elec, - gmx::ArrayRef energygrp_vdw, - t_nrnb* gmx_restrict nrnb); +void gmx_nb_free_energy_kernel(const t_nblist& nlist, + const gmx::ArrayRefWithPadding& coords, + bool useSimd, + int ntype, + real rlist, + const interaction_const_t& ic, + gmx::ArrayRef shiftvec, + gmx::ArrayRef nbfp, + gmx::ArrayRef nbfp_grid, + gmx::ArrayRef chargeA, + gmx::ArrayRef chargeB, + gmx::ArrayRef typeA, + gmx::ArrayRef typeB, + int flags, + gmx::ArrayRef lambda, + t_nrnb* gmx_restrict nrnb, + gmx::RVec* threadForceBuffer, + rvec* threadForceShiftBuffer, + gmx::ArrayRef threadVc, + gmx::ArrayRef threadVv, + gmx::ArrayRef threadDvdl); #endif diff --git a/src/gromacs/gmxlib/nonbonded/tests/nb_free_energy.cpp b/src/gromacs/gmxlib/nonbonded/tests/nb_free_energy.cpp index 3ffc4d9ab1..d25b98c1dd 100644 --- a/src/gromacs/gmxlib/nonbonded/tests/nb_free_energy.cpp +++ b/src/gromacs/gmxlib/nonbonded/tests/nb_free_energy.cpp @@ -416,7 +416,7 @@ protected: // When the free-energy kernel switches from tabulated to analytical corrections, // the double precision tolerance can be tightend to 1e-11. test::FloatingPointTolerance tolerance( - input_.floatToler, input_.doubleToler, 1.0e-6, 1.0e-6, 10000, 100, false); + input_.floatToler, input_.doubleToler, 1.0e-6, 1.0e-11, 10000, 100, false); checker_.setDefaultTolerance(tolerance); } @@ -454,8 +454,7 @@ protected: // run fep kernel gmx_nb_free_energy_kernel(nbl, - x_.arrayRefWithPadding().unpaddedArrayRef(), - &forces, + x_.arrayRefWithPadding(), fr.use_simd_kernels, fr.ntype, fr.rlist, @@ -469,10 +468,12 @@ protected: input_.atoms.typeB, doNBFlags, lambdas, - output.dvdLambda, + &nrnb, + output.f.arrayRefWithPadding().paddedArrayRef().data(), + as_rvec_array(output.fShift.data()), output.energy.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR], output.energy.energyGroupPairTerms[NonBondedEnergyTerms::LJSR], - &nrnb); + output.dvdLambda); checkOutput(&checker_, output); } diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 13a13d23ea..1d703a7245 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1634,6 +1634,15 @@ void do_force(FILE* fplog, } } + // With FEP we set up the reduction over threads for local+non-local simultaneously, + // so we need to do that here after the local and non-local pairlist construction. + if (stepWork.doNeighborSearch && fr->efep != FreeEnergyPerturbationType::No) + { + wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFep); + nbv->setupFepThreadedForceBuffer(fr->natoms_force_constr); + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFep); + } + if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces) { /* launch D2H copy-back F */ @@ -1776,9 +1785,8 @@ void do_force(FILE* fplog, /* Calculate the local and non-local free energy interactions here. * Happens here on the CPU both with and without GPU. */ - nbv->dispatchFreeEnergyKernel( - InteractionLocality::Local, - x.unpaddedArrayRef(), + nbv->dispatchFreeEnergyKernels( + x, &forceOutNonbonded->forceWithShiftForces(), fr->use_simd_kernels, fr->ntype, @@ -1800,34 +1808,6 @@ void do_force(FILE* fplog, enerd, stepWork, nrnb); - - if (simulationWork.havePpDomainDecomposition) - { - nbv->dispatchFreeEnergyKernel( - InteractionLocality::NonLocal, - x.unpaddedArrayRef(), - &forceOutNonbonded->forceWithShiftForces(), - fr->use_simd_kernels, - fr->ntype, - fr->rlist, - *fr->ic, - fr->shift_vec, - fr->nbfp, - fr->ljpme_c6grid, - mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr) - : gmx::ArrayRef{}, - mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr) - : gmx::ArrayRef{}, - mdatoms->typeA ? gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr) - : gmx::ArrayRef{}, - mdatoms->typeB ? gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr) - : gmx::ArrayRef{}, - inputrec.fepvals.get(), - lambda, - enerd, - stepWork, - nrnb); - } } if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb) diff --git a/src/gromacs/nbnxm/CMakeLists.txt b/src/gromacs/nbnxm/CMakeLists.txt index d95b2c5772..ca5d092802 100644 --- a/src/gromacs/nbnxm/CMakeLists.txt +++ b/src/gromacs/nbnxm/CMakeLists.txt @@ -38,9 +38,10 @@ add_library(nbnxm INTERFACE) add_subdirectory(kernels_simd_4xm) add_subdirectory(kernels_simd_2xmm) -file (GLOB NBNXM_SOURCES +file(GLOB NBNXM_SOURCES # Source files atomdata.cpp + freeenergydispatch.cpp grid.cpp gridset.cpp kernel_common.cpp diff --git a/src/gromacs/nbnxm/freeenergydispatch.cpp b/src/gromacs/nbnxm/freeenergydispatch.cpp new file mode 100644 index 0000000000..826bd9ff61 --- /dev/null +++ b/src/gromacs/nbnxm/freeenergydispatch.cpp @@ -0,0 +1,462 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#include "gmxpre.h" + +#include "freeenergydispatch.h" + +#include "gromacs/gmxlib/nrnb.h" +#include "gromacs/gmxlib/nonbonded/nb_free_energy.h" +#include "gromacs/gmxlib/nonbonded/nonbonded.h" +#include "gromacs/math/vectypes.h" +#include "gromacs/mdlib/enerdata_utils.h" +#include "gromacs/mdlib/force.h" +#include "gromacs/mdlib/gmx_omp_nthreads.h" +#include "gromacs/mdtypes/enerdata.h" +#include "gromacs/mdtypes/forceoutput.h" +#include "gromacs/mdtypes/inputrec.h" +#include "gromacs/mdtypes/interaction_const.h" +#include "gromacs/mdtypes/md_enums.h" +#include "gromacs/mdtypes/nblist.h" +#include "gromacs/mdtypes/simulation_workload.h" +#include "gromacs/mdtypes/threaded_force_buffer.h" +#include "gromacs/nbnxm/nbnxm.h" +#include "gromacs/timing/wallcycle.h" +#include "gromacs/utility/enumerationhelpers.h" +#include "gromacs/utility/gmxassert.h" +#include "gromacs/utility/real.h" + +#include "pairlistset.h" +#include "pairlistsets.h" + +FreeEnergyDispatch::FreeEnergyDispatch(const int numEnergyGroups) : + foreignGroupPairEnergies_(numEnergyGroups), + threadedForceBuffer_(gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded), false, numEnergyGroups), + threadedForeignEnergyBuffer_(gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded), false, numEnergyGroups) +{ +} + +namespace +{ + +//! Flags all atoms present in pairlist \p nlist in the mask in \p threadForceBuffer +void setReductionMaskFromFepPairlist(const t_nblist& gmx_restrict nlist, + gmx::ThreadForceBuffer* threadForceBuffer) +{ + // Extract pair list data + gmx::ArrayRef iinr = nlist.iinr; + gmx::ArrayRef jjnr = nlist.jjnr; + + for (int i : iinr) + { + threadForceBuffer->addAtomToMask(i); + } + for (int j : jjnr) + { + threadForceBuffer->addAtomToMask(j); + } +} + +} // namespace + +void FreeEnergyDispatch::setupFepThreadedForceBuffer(const int numAtomsForce, const PairlistSets& pairlistSets) +{ + const int numThreads = threadedForceBuffer_.numThreadBuffers(); + + GMX_ASSERT(gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded) == numThreads, + "The number of buffers should be same as number of NB threads"); + +#pragma omp parallel for num_threads(numThreads) schedule(static) + for (int th = 0; th < numThreads; th++) + { + auto& threadForceBuffer = threadedForceBuffer_.threadForceBuffer(th); + + threadForceBuffer.resizeBufferAndClearMask(numAtomsForce); + + setReductionMaskFromFepPairlist( + *pairlistSets.pairlistSet(gmx::InteractionLocality::Local).fepLists()[th], + &threadForceBuffer); + if (pairlistSets.params().haveMultipleDomains) + { + setReductionMaskFromFepPairlist( + *pairlistSets.pairlistSet(gmx::InteractionLocality::NonLocal).fepLists()[th], + &threadForceBuffer); + } + + threadForceBuffer.processMask(); + } + + threadedForceBuffer_.setupReduction(); +} + +void nonbonded_verlet_t::setupFepThreadedForceBuffer(const int numAtomsForce) +{ + if (!pairlistSets_->params().haveFep) + { + return; + } + + GMX_RELEASE_ASSERT(freeEnergyDispatch_, "Need a valid dispatch object"); + + freeEnergyDispatch_->setupFepThreadedForceBuffer(numAtomsForce, *pairlistSets_); +} + +namespace +{ + +void dispatchFreeEnergyKernel(gmx::ArrayRef> nbl_fep, + const gmx::ArrayRefWithPadding& coords, + bool useSimd, + int ntype, + real rlist, + const interaction_const_t& ic, + gmx::ArrayRef shiftvec, + gmx::ArrayRef nbfp, + gmx::ArrayRef nbfp_grid, + gmx::ArrayRef chargeA, + gmx::ArrayRef chargeB, + gmx::ArrayRef typeA, + gmx::ArrayRef typeB, + t_lambda* fepvals, + gmx::ArrayRef lambda, + const bool clearForcesAndEnergies, + gmx::ThreadedForceBuffer* threadedForceBuffer, + gmx::ThreadedForceBuffer* threadedForeignEnergyBuffer, + gmx_grppairener_t* foreignGroupPairEnergies, + gmx_enerdata_t* enerd, + const gmx::StepWorkload& stepWork, + t_nrnb* nrnb) +{ + int donb_flags = 0; + /* Add short-range interactions */ + donb_flags |= GMX_NONBONDED_DO_SR; + + if (stepWork.computeForces) + { + donb_flags |= GMX_NONBONDED_DO_FORCE; + } + if (stepWork.computeVirial) + { + donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; + } + if (stepWork.computeEnergy) + { + donb_flags |= GMX_NONBONDED_DO_POTENTIAL; + } + + GMX_ASSERT(gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded) == nbl_fep.ssize(), + "Number of lists should be same as number of NB threads"); + +#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize()) + for (gmx::index th = 0; th < nbl_fep.ssize(); th++) + { + try + { + auto& threadForceBuffer = threadedForceBuffer->threadForceBuffer(th); + + if (clearForcesAndEnergies) + { + threadForceBuffer.clearForcesAndEnergies(); + } + + gmx::RVec* threadForces = threadForceBuffer.forceBuffer(); + rvec* threadForceShiftBuffer = as_rvec_array(threadForceBuffer.shiftForces().data()); + gmx::ArrayRef threadVc = + threadForceBuffer.groupPairEnergies().energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR]; + gmx::ArrayRef threadVv = + threadForceBuffer.groupPairEnergies().energyGroupPairTerms[NonBondedEnergyTerms::LJSR]; + gmx::ArrayRef threadDvdl = threadForceBuffer.dvdl(); + + gmx_nb_free_energy_kernel(*nbl_fep[th], + coords, + useSimd, + ntype, + rlist, + ic, + shiftvec, + nbfp, + nbfp_grid, + chargeA, + chargeB, + typeA, + typeB, + donb_flags, + lambda, + nrnb, + threadForces, + threadForceShiftBuffer, + threadVc, + threadVv, + threadDvdl); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR + } + + /* If we do foreign lambda and we have soft-core interactions + * we have to recalculate the (non-linear) energies contributions. + */ + if (fepvals->n_lambda > 0 && stepWork.computeDhdl && fepvals->sc_alpha != 0) + { + gmx::StepWorkload stepWorkForeignEnergies = stepWork; + stepWorkForeignEnergies.computeForces = false; + stepWorkForeignEnergies.computeVirial = false; + + gmx::EnumerationArray lam_i; + gmx::EnumerationArray dvdl_nb = { 0 }; + const int kernelFlags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) + | GMX_NONBONDED_DO_FOREIGNLAMBDA; + + for (gmx::index i = 0; i < 1 + enerd->foreignLambdaTerms.numLambdas(); i++) + { + std::fill(std::begin(dvdl_nb), std::end(dvdl_nb), 0); + for (int j = 0; j < static_cast(FreeEnergyPerturbationCouplingType::Count); j++) + { + lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i - 1]); + } + +#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize()) + for (gmx::index th = 0; th < nbl_fep.ssize(); th++) + { + try + { + // Note that here we only compute energies and dV/dlambda, but we need + // to pass a force buffer. No forces are compute and stored. + auto& threadForeignEnergyBuffer = threadedForeignEnergyBuffer->threadForceBuffer(th); + + threadForeignEnergyBuffer.clearForcesAndEnergies(); + + gmx::ArrayRef threadVc = + threadForeignEnergyBuffer.groupPairEnergies() + .energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR]; + gmx::ArrayRef threadVv = + threadForeignEnergyBuffer.groupPairEnergies() + .energyGroupPairTerms[NonBondedEnergyTerms::LJSR]; + gmx::ArrayRef threadDvdl = threadForeignEnergyBuffer.dvdl(); + + gmx_nb_free_energy_kernel(*nbl_fep[th], + coords, + useSimd, + ntype, + rlist, + ic, + shiftvec, + nbfp, + nbfp_grid, + chargeA, + chargeB, + typeA, + typeB, + kernelFlags, + lam_i, + nrnb, + nullptr, + nullptr, + threadVc, + threadVv, + threadDvdl); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR + } + + foreignGroupPairEnergies->clear(); + threadedForeignEnergyBuffer->reduce( + nullptr, nullptr, foreignGroupPairEnergies, dvdl_nb, stepWorkForeignEnergies, 0); + + std::array foreign_term = { 0 }; + sum_epot(*foreignGroupPairEnergies, foreign_term.data()); + // Accumulate the foreign energy difference and dV/dlambda into the passed enerd + enerd->foreignLambdaTerms.accumulate( + i, + foreign_term[F_EPOT], + dvdl_nb[FreeEnergyPerturbationCouplingType::Vdw] + + dvdl_nb[FreeEnergyPerturbationCouplingType::Coul]); + } + } +} + +} // namespace + +void FreeEnergyDispatch::dispatchFreeEnergyKernels(const PairlistSets& pairlistSets, + const gmx::ArrayRefWithPadding& coords, + gmx::ForceWithShiftForces* forceWithShiftForces, + const bool useSimd, + const int ntype, + const real rlist, + const interaction_const_t& ic, + gmx::ArrayRef shiftvec, + gmx::ArrayRef nbfp, + gmx::ArrayRef nbfp_grid, + gmx::ArrayRef chargeA, + gmx::ArrayRef chargeB, + gmx::ArrayRef typeA, + gmx::ArrayRef typeB, + t_lambda* fepvals, + gmx::ArrayRef lambda, + gmx_enerdata_t* enerd, + const gmx::StepWorkload& stepWork, + t_nrnb* nrnb, + gmx_wallcycle* wcycle) +{ + GMX_ASSERT(pairlistSets.params().haveFep, "We should have a free-energy pairlist"); + + wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFep); + + const int numLocalities = (pairlistSets.params().haveMultipleDomains ? 2 : 1); + // The first call to dispatchFreeEnergyKernel() should clear the buffers. Clearing happens + // inside that function to avoid an extra OpenMP parallel region here. We need a boolean + // to track the need for clearing. + // A better solution would be to move the OpenMP parallel region here, but that first + // requires modifying ThreadedForceBuffer.reduce() to be called thread parallel. + bool clearForcesAndEnergies = true; + for (int i = 0; i < numLocalities; i++) + { + const gmx::InteractionLocality iLocality = static_cast(i); + const auto fepPairlists = pairlistSets.pairlistSet(iLocality).fepLists(); + /* When the first list is empty, all are empty and there is nothing to do */ + if (fepPairlists[0]->nrj > 0) + { + dispatchFreeEnergyKernel(fepPairlists, + coords, + useSimd, + ntype, + rlist, + ic, + shiftvec, + nbfp, + nbfp_grid, + chargeA, + chargeB, + typeA, + typeB, + fepvals, + lambda, + clearForcesAndEnergies, + &threadedForceBuffer_, + &threadedForeignEnergyBuffer_, + &foreignGroupPairEnergies_, + enerd, + stepWork, + nrnb); + } + else if (clearForcesAndEnergies) + { + // We need to clear the thread force buffer. + // With a non-empty pairlist we do this in dispatchFreeEnergyKernel() + // to avoid the overhead of an extra openMP parallel loop +#pragma omp parallel for schedule(static) num_threads(fepPairlists.ssize()) + for (gmx::index th = 0; th < fepPairlists.ssize(); th++) + { + try + { + threadedForceBuffer_.threadForceBuffer(th).clearForcesAndEnergies(); + } + GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR + } + } + clearForcesAndEnergies = false; + } + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFep); + + wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFepReduction); + + gmx::EnumerationArray dvdl_nb = { 0 }; + + threadedForceBuffer_.reduce(forceWithShiftForces, nullptr, &enerd->grpp, dvdl_nb, stepWork, 0); + + if (fepvals->sc_alpha != 0) + { + enerd->dvdl_nonlin[FreeEnergyPerturbationCouplingType::Vdw] += + dvdl_nb[FreeEnergyPerturbationCouplingType::Vdw]; + enerd->dvdl_nonlin[FreeEnergyPerturbationCouplingType::Coul] += + dvdl_nb[FreeEnergyPerturbationCouplingType::Coul]; + } + else + { + enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += + dvdl_nb[FreeEnergyPerturbationCouplingType::Vdw]; + enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Coul] += + dvdl_nb[FreeEnergyPerturbationCouplingType::Coul]; + } + + wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFepReduction); +} + +void nonbonded_verlet_t::dispatchFreeEnergyKernels(const gmx::ArrayRefWithPadding& coords, + gmx::ForceWithShiftForces* forceWithShiftForces, + const bool useSimd, + const int ntype, + const real rlist, + const interaction_const_t& ic, + gmx::ArrayRef shiftvec, + gmx::ArrayRef nbfp, + gmx::ArrayRef nbfp_grid, + gmx::ArrayRef chargeA, + gmx::ArrayRef chargeB, + gmx::ArrayRef typeA, + gmx::ArrayRef typeB, + t_lambda* fepvals, + gmx::ArrayRef lambda, + gmx_enerdata_t* enerd, + const gmx::StepWorkload& stepWork, + t_nrnb* nrnb) +{ + if (!pairlistSets_->params().haveFep) + { + return; + } + + GMX_RELEASE_ASSERT(freeEnergyDispatch_, "Need a valid dispatch object"); + + freeEnergyDispatch_->dispatchFreeEnergyKernels(*pairlistSets_, + coords, + forceWithShiftForces, + useSimd, + ntype, + rlist, + ic, + shiftvec, + nbfp, + nbfp_grid, + chargeA, + chargeB, + typeA, + typeB, + fepvals, + lambda, + enerd, + stepWork, + nrnb, + wcycle_); +} diff --git a/src/gromacs/nbnxm/freeenergydispatch.h b/src/gromacs/nbnxm/freeenergydispatch.h new file mode 100644 index 0000000000..dc09a725cf --- /dev/null +++ b/src/gromacs/nbnxm/freeenergydispatch.h @@ -0,0 +1,112 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief + * Declares the free-energy kernel dispatch class + * + * \author Berk Hess + * \ingroup module_nbnxm + */ +#ifndef GMX_NBNXM_FREEENERGYDISPATCH_H +#define GMX_NBNXM_FREEENERGYDISPATCH_H + +#include + +#include "gromacs/math/vectypes.h" +#include "gromacs/mdtypes/enerdata.h" +#include "gromacs/mdtypes/threaded_force_buffer.h" +#include "gromacs/utility/arrayref.h" + +struct gmx_enerdata_t; +struct gmx_wallcycle; +struct interaction_const_t; +class PairlistSets; +struct t_lambda; +struct t_nrnb; + +namespace gmx +{ +template +class ArrayRefWithPadding; +class ForceWithShiftForces; +class StepWorkload; +} // namespace gmx + +/*! \libinternal + * \brief Temporary data and methods for handling dispatching of the nbnxm free-energy kernels + */ +class FreeEnergyDispatch +{ +public: + //! Constructor + FreeEnergyDispatch(int numEnergyGroups); + + //! Sets up the threaded force buffer and the reduction, should be called after constructing the pair lists + void setupFepThreadedForceBuffer(int numAtomsForce, const PairlistSets& pairlistSets); + + //! Dispatches the non-bonded free-energy kernels, thread parallel and reduces the output + void dispatchFreeEnergyKernels(const PairlistSets& pairlistSets, + const gmx::ArrayRefWithPadding& coords, + gmx::ForceWithShiftForces* forceWithShiftForces, + bool useSimd, + int ntype, + real rlist, + const interaction_const_t& ic, + gmx::ArrayRef shiftvec, + gmx::ArrayRef nbfp, + gmx::ArrayRef nbfp_grid, + gmx::ArrayRef chargeA, + gmx::ArrayRef chargeB, + gmx::ArrayRef typeA, + gmx::ArrayRef typeB, + t_lambda* fepvals, + gmx::ArrayRef lambda, + gmx_enerdata_t* enerd, + const gmx::StepWorkload& stepWork, + t_nrnb* nrnb, + gmx_wallcycle* wcycle); + +private: + //! Temporary array for storing foreign lambda group pair energies + gmx_grppairener_t foreignGroupPairEnergies_; + + //! Threaded force buffer for nonbonded FEP + gmx::ThreadedForceBuffer threadedForceBuffer_; + //! Threaded buffer for nonbonded FEP foreign energies and dVdl, no forces, so numAtoms = 0 + gmx::ThreadedForceBuffer threadedForeignEnergyBuffer_; +}; + +#endif // GMX_NBNXN_FREEENERGYDISPATCH_H diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp index f7229169e3..be105c93ee 100644 --- a/src/gromacs/nbnxm/kerneldispatch.cpp +++ b/src/gromacs/nbnxm/kerneldispatch.cpp @@ -37,8 +37,6 @@ #include "gmxpre.h" #include "gromacs/gmxlib/nrnb.h" -#include "gromacs/gmxlib/nonbonded/nb_free_energy.h" -#include "gromacs/gmxlib/nonbonded/nonbonded.h" #include "gromacs/math/vectypes.h" #include "gromacs/mdlib/enerdata_utils.h" #include "gromacs/mdlib/force.h" @@ -497,167 +495,3 @@ void nonbonded_verlet_t::dispatchNonbondedKernel(gmx::InteractionLocality accountFlops(nrnb, pairlistSet, *this, ic, stepWork); } - -void nonbonded_verlet_t::dispatchFreeEnergyKernel(gmx::InteractionLocality iLocality, - gmx::ArrayRef coords, - gmx::ForceWithShiftForces* forceWithShiftForces, - bool useSimd, - int ntype, - real rlist, - const interaction_const_t& ic, - gmx::ArrayRef shiftvec, - gmx::ArrayRef nbfp, - gmx::ArrayRef nbfp_grid, - gmx::ArrayRef chargeA, - gmx::ArrayRef chargeB, - gmx::ArrayRef typeA, - gmx::ArrayRef typeB, - t_lambda* fepvals, - gmx::ArrayRef lambda, - gmx_enerdata_t* enerd, - const gmx::StepWorkload& stepWork, - t_nrnb* nrnb) -{ - const auto nbl_fep = pairlistSets().pairlistSet(iLocality).fepLists(); - - /* When the first list is empty, all are empty and there is nothing to do */ - if (!pairlistSets().params().haveFep || nbl_fep[0]->nrj == 0) - { - return; - } - - int donb_flags = 0; - /* Add short-range interactions */ - donb_flags |= GMX_NONBONDED_DO_SR; - - if (stepWork.computeForces) - { - donb_flags |= GMX_NONBONDED_DO_FORCE; - } - if (stepWork.computeVirial) - { - donb_flags |= GMX_NONBONDED_DO_SHIFTFORCE; - } - if (stepWork.computeEnergy) - { - donb_flags |= GMX_NONBONDED_DO_POTENTIAL; - } - - gmx::EnumerationArray dvdl_nb = { 0 }; - int kernelFlags = donb_flags; - gmx::ArrayRef kernelLambda = lambda; - gmx::ArrayRef kernelDvdl = dvdl_nb; - - gmx::ArrayRef energygrp_elec = enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR]; - gmx::ArrayRef energygrp_vdw = enerd->grpp.energyGroupPairTerms[NonBondedEnergyTerms::LJSR]; - - GMX_ASSERT(gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded) == nbl_fep.ssize(), - "Number of lists should be same as number of NB threads"); - - wallcycle_sub_start(wcycle_, WallCycleSubCounter::NonbondedFep); -#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize()) - for (gmx::index th = 0; th < nbl_fep.ssize(); th++) - { - try - { - gmx_nb_free_energy_kernel(*nbl_fep[th], - coords, - forceWithShiftForces, - useSimd, - ntype, - rlist, - ic, - shiftvec, - nbfp, - nbfp_grid, - chargeA, - chargeB, - typeA, - typeB, - kernelFlags, - kernelLambda, - kernelDvdl, - energygrp_elec, - energygrp_vdw, - nrnb); - } - GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR - } - - if (fepvals->sc_alpha != 0) - { - enerd->dvdl_nonlin[FreeEnergyPerturbationCouplingType::Vdw] += - dvdl_nb[FreeEnergyPerturbationCouplingType::Vdw]; - enerd->dvdl_nonlin[FreeEnergyPerturbationCouplingType::Coul] += - dvdl_nb[FreeEnergyPerturbationCouplingType::Coul]; - } - else - { - enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Vdw] += - dvdl_nb[FreeEnergyPerturbationCouplingType::Vdw]; - enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Coul] += - dvdl_nb[FreeEnergyPerturbationCouplingType::Coul]; - } - - /* If we do foreign lambda and we have soft-core interactions - * we have to recalculate the (non-linear) energies contributions. - */ - if (fepvals->n_lambda > 0 && stepWork.computeDhdl && fepvals->sc_alpha != 0) - { - gmx::EnumerationArray lam_i; - kernelFlags = (donb_flags & ~(GMX_NONBONDED_DO_FORCE | GMX_NONBONDED_DO_SHIFTFORCE)) - | GMX_NONBONDED_DO_FOREIGNLAMBDA; - kernelLambda = lam_i; - kernelDvdl = dvdl_nb; - gmx::ArrayRef energygrp_elec = - foreignEnergyGroups_->energyGroupPairTerms[NonBondedEnergyTerms::CoulombSR]; - gmx::ArrayRef energygrp_vdw = - foreignEnergyGroups_->energyGroupPairTerms[NonBondedEnergyTerms::LJSR]; - - for (gmx::index i = 0; i < 1 + enerd->foreignLambdaTerms.numLambdas(); i++) - { - std::fill(std::begin(dvdl_nb), std::end(dvdl_nb), 0); - for (int j = 0; j < static_cast(FreeEnergyPerturbationCouplingType::Count); j++) - { - lam_i[j] = (i == 0 ? lambda[j] : fepvals->all_lambda[j][i - 1]); - } - foreignEnergyGroups_->clear(); -#pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize()) - for (gmx::index th = 0; th < nbl_fep.ssize(); th++) - { - try - { - gmx_nb_free_energy_kernel(*nbl_fep[th], - coords, - forceWithShiftForces, - useSimd, - ntype, - rlist, - ic, - shiftvec, - nbfp, - nbfp_grid, - chargeA, - chargeB, - typeA, - typeB, - kernelFlags, - kernelLambda, - kernelDvdl, - energygrp_elec, - energygrp_vdw, - nrnb); - } - GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR - } - std::array foreign_term = { 0 }; - sum_epot(*foreignEnergyGroups_, foreign_term.data()); - enerd->foreignLambdaTerms.accumulate( - i, - foreign_term[F_EPOT], - dvdl_nb[FreeEnergyPerturbationCouplingType::Vdw] - + dvdl_nb[FreeEnergyPerturbationCouplingType::Coul]); - } - } - wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NonbondedFep); -} diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp index dca77313ed..f634f75fac 100644 --- a/src/gromacs/nbnxm/nbnxm.cpp +++ b/src/gromacs/nbnxm/nbnxm.cpp @@ -258,5 +258,4 @@ bool buildSupportsNonbondedOnGpu(std::string* error) return errorReasons.isEmpty(); } - /*! \endcond */ diff --git a/src/gromacs/nbnxm/nbnxm.h b/src/gromacs/nbnxm/nbnxm.h index e3e90d9238..b438c9bc33 100644 --- a/src/gromacs/nbnxm/nbnxm.h +++ b/src/gromacs/nbnxm/nbnxm.h @@ -121,6 +121,7 @@ #include "gromacs/utility/real.h" struct DeviceInformation; +class FreeEnergyDispatch; struct gmx_domdec_zones_t; struct gmx_enerdata_t; struct gmx_hw_info_t; @@ -144,6 +145,8 @@ class GpuEventSynchronizer; namespace gmx { +template +class ArrayRefWithPadding; class DeviceStreamManager; class ForceWithShiftForces; class ListedForcesGpu; @@ -368,26 +371,25 @@ public: gmx::ArrayRef CoulombSR, t_nrnb* nrnb) const; - //! Executes the non-bonded free-energy kernel, always runs on the CPU - void dispatchFreeEnergyKernel(gmx::InteractionLocality iLocality, - gmx::ArrayRef coords, - gmx::ForceWithShiftForces* forceWithShiftForces, - bool useSimd, - int ntype, - real rlist, - const interaction_const_t& ic, - gmx::ArrayRef shiftvec, - gmx::ArrayRef nbfp, - gmx::ArrayRef nbfp_grid, - gmx::ArrayRef chargeA, - gmx::ArrayRef chargeB, - gmx::ArrayRef typeA, - gmx::ArrayRef typeB, - t_lambda* fepvals, - gmx::ArrayRef lambda, - gmx_enerdata_t* enerd, - const gmx::StepWorkload& stepWork, - t_nrnb* nrnb); + //! Executes the non-bonded free-energy kernels, local + non-local, always runs on the CPU + void dispatchFreeEnergyKernels(const gmx::ArrayRefWithPadding& coords, + gmx::ForceWithShiftForces* forceWithShiftForces, + bool useSimd, + int ntype, + real rlist, + const interaction_const_t& ic, + gmx::ArrayRef shiftvec, + gmx::ArrayRef nbfp, + gmx::ArrayRef nbfp_grid, + gmx::ArrayRef chargeA, + gmx::ArrayRef chargeB, + gmx::ArrayRef typeA, + gmx::ArrayRef typeB, + t_lambda* fepvals, + gmx::ArrayRef lambda, + gmx_enerdata_t* enerd, + const gmx::StepWorkload& stepWork, + t_nrnb* nrnb); /*! \brief Add the forces stored in nbat to f, zeros the forces in nbat * \param [in] locality Local or non-local @@ -418,6 +420,8 @@ public: void setupGpuShortRangeWork(const gmx::ListedForcesGpu* listedForcesGpu, gmx::InteractionLocality iLocality) const; + void setupFepThreadedForceBuffer(int numAtomsForce); + // TODO: Make all data members private //! All data related to the pair lists std::unique_ptr pairlistSets_; @@ -429,10 +433,12 @@ public: private: //! The non-bonded setup, also affects the pairlist construction kernel Nbnxm::KernelSetup kernelSetup_; + //! \brief Pointer to wallcycle structure. gmx_wallcycle* wcycle_; - //! Temporary array for storing foreign lambda group pair energies - std::unique_ptr foreignEnergyGroups_; + + //! \brief The non-bonded free-energy kernel dispatcher + std::unique_ptr freeEnergyDispatch_; public: //! GPU Nbnxm data, only used with a physical GPU (TODO: use unique_ptr) diff --git a/src/gromacs/nbnxm/nbnxm_setup.cpp b/src/gromacs/nbnxm/nbnxm_setup.cpp index b279596730..3b448938fb 100644 --- a/src/gromacs/nbnxm/nbnxm_setup.cpp +++ b/src/gromacs/nbnxm/nbnxm_setup.cpp @@ -62,6 +62,7 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/logger.h" +#include "freeenergydispatch.h" #include "grid.h" #include "nbnxm_geometry.h" #include "nbnxm_simd.h" @@ -485,12 +486,16 @@ nonbonded_verlet_t::nonbonded_verlet_t(std::unique_ptr pairlis nbat(std::move(nbat_in)), kernelSetup_(kernelSetup), wcycle_(wcycle), - foreignEnergyGroups_(std::make_unique(nbat->params().nenergrp)), gpu_nbv(gpu_nbv_ptr) { GMX_RELEASE_ASSERT(pairlistSets_, "Need valid pairlistSets"); GMX_RELEASE_ASSERT(pairSearch_, "Need valid search object"); GMX_RELEASE_ASSERT(nbat, "Need valid atomdata object"); + + if (pairlistSets_->params().haveFep) + { + freeEnergyDispatch_ = std::make_unique(nbat->params().nenergrp); + } } nonbonded_verlet_t::~nonbonded_verlet_t() diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp index cd6f432485..12d5f376d0 100644 --- a/src/gromacs/timing/wallcycle.cpp +++ b/src/gromacs/timing/wallcycle.cpp @@ -149,6 +149,7 @@ static const char* enumValuetoString(WallCycleSubCounter enumValue) "Nonbonded F kernel", "Nonbonded F clear", "Nonbonded FEP", + "Nonbonded FEP reduction", "Launch NB GPU tasks", "Launch Bonded GPU tasks", "Launch PME GPU tasks", diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h index 60669d3fbc..ca256607c5 100644 --- a/src/gromacs/timing/wallcycle.h +++ b/src/gromacs/timing/wallcycle.h @@ -142,6 +142,7 @@ enum class WallCycleSubCounter : int NonbondedKernel, NonbondedClear, NonbondedFep, + NonbondedFepReduction, LaunchGpuNonBonded, LaunchGpuBonded, LaunchGpuPme, -- 2.22.0