From f7be07e3cc901eb03700d93248fc09b573370282 Mon Sep 17 00:00:00 2001 From: Magnus Lundborg Date: Mon, 20 Jul 2020 10:17:02 +0000 Subject: [PATCH] Two sets of coefficients for Coulomb FEP PME on GPU The first patch in a series to enable running Coulomb FEP PME on GPU. Use two sets of coefficients to store atom charges. Refs #2054, #3117 Change-Id: Iab6eb7ac766800f7c045dc5a00069e77509d391f --- docs/release-notes/2021/major/performance.rst | 5 + docs/user-guide/mdrun-performance.rst | 3 - src/gromacs/domdec/mdsetup.cpp | 2 +- src/gromacs/ewald/pme.cpp | 22 +- src/gromacs/ewald/pme.h | 32 +- src/gromacs/ewald/pme_gather.clh | 240 ++++++-- src/gromacs/ewald/pme_gather.cu | 295 ++++++--- src/gromacs/ewald/pme_gpu.cpp | 104 ++-- src/gromacs/ewald/pme_gpu_3dfft.cu | 6 +- src/gromacs/ewald/pme_gpu_3dfft.h | 3 +- src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp | 6 +- .../ewald/pme_gpu_calculate_splines.cuh | 15 +- src/gromacs/ewald/pme_gpu_internal.cpp | 579 +++++++++++++----- src/gromacs/ewald/pme_gpu_internal.h | 94 ++- src/gromacs/ewald/pme_gpu_program_impl.cu | 114 ++-- src/gromacs/ewald/pme_gpu_program_impl.h | 54 +- .../ewald/pme_gpu_program_impl_ocl.cpp | 105 +++- src/gromacs/ewald/pme_gpu_staging.h | 9 +- src/gromacs/ewald/pme_gpu_types.h | 20 +- src/gromacs/ewald/pme_gpu_types_host.h | 2 +- src/gromacs/ewald/pme_gpu_types_host_impl.h | 21 +- src/gromacs/ewald/pme_only.cpp | 10 +- src/gromacs/ewald/pme_output.h | 4 + src/gromacs/ewald/pme_program.cl | 115 +++- src/gromacs/ewald/pme_solve.clh | 8 +- src/gromacs/ewald/pme_solve.cu | 42 +- src/gromacs/ewald/pme_spread.clh | 40 +- src/gromacs/ewald/pme_spread.cu | 75 ++- src/gromacs/ewald/tests/pmetestcommon.cpp | 47 +- src/gromacs/mdlib/mdatoms.cpp | 30 +- src/gromacs/mdlib/mdatoms.h | 20 +- src/gromacs/mdlib/sim_util.cpp | 30 +- src/gromacs/mdrun/runner.cpp | 4 +- src/gromacs/mdrun/tpi.cpp | 2 +- src/gromacs/taskassignment/decidegpuusage.cpp | 7 +- src/gromacs/taskassignment/decidegpuusage.h | 4 - .../taskassignment/resourcedivision.cpp | 2 +- 37 files changed, 1540 insertions(+), 631 deletions(-) diff --git a/docs/release-notes/2021/major/performance.rst b/docs/release-notes/2021/major/performance.rst index 0931f3843f..d486d0c308 100644 --- a/docs/release-notes/2021/major/performance.rst +++ b/docs/release-notes/2021/major/performance.rst @@ -20,3 +20,8 @@ The time `gmx grompp` spent processing distance restraint has been changed from quadratic in the number of restraints to linear. :issue:`3457` + +Support for offloading PME to GPU when doing Coulomb FEP +"""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +PME calculations can be offloaded to GPU when doing Coulomb free-energy perturbations. diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst index d14e4fe43d..1309ac0806 100644 --- a/docs/user-guide/mdrun-performance.rst +++ b/docs/user-guide/mdrun-performance.rst @@ -1093,9 +1093,6 @@ Known limitations - Only single precision is supported. -- Free energy calculations where charges are perturbed are not supported, - because only single PME grids can be calculated. - - Only dynamical integrators are supported (ie. leap-frog, Velocity Verlet, stochastic dynamics) diff --git a/src/gromacs/domdec/mdsetup.cpp b/src/gromacs/domdec/mdsetup.cpp index 2c8027b7f2..9c71dd410c 100644 --- a/src/gromacs/domdec/mdsetup.cpp +++ b/src/gromacs/domdec/mdsetup.cpp @@ -137,7 +137,7 @@ void mdAlgorithmsSetupAtomData(const t_commrec* cr, * For PME-only ranks, gmx_pmeonly() has its own call to gmx_pme_reinit_atoms(). 
*/ const int numPmeAtoms = numHomeAtoms - fr->n_tpi; - gmx_pme_reinit_atoms(fr->pmedata, numPmeAtoms, mdatoms->chargeA); + gmx_pme_reinit_atoms(fr->pmedata, numPmeAtoms, mdatoms->chargeA, mdatoms->chargeB); } if (constr) diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp index 0f59ebe63e..f26c2f6fe3 100644 --- a/src/gromacs/ewald/pme.cpp +++ b/src/gromacs/ewald/pme.cpp @@ -179,7 +179,7 @@ bool pme_gpu_supports_hardware(const gmx_hw_info_t gmx_unused& hwinfo, std::stri return addMessageIfNotSupported(errorReasons, error); } -bool pme_gpu_supports_input(const t_inputrec& ir, const gmx_mtop_t& mtop, std::string* error) +bool pme_gpu_supports_input(const t_inputrec& ir, std::string* error) { std::list errorReasons; if (!EEL_PME(ir.coulombtype)) @@ -190,14 +190,6 @@ bool pme_gpu_supports_input(const t_inputrec& ir, const gmx_mtop_t& mtop, std::s { errorReasons.emplace_back("interpolation orders other than 4"); } - if (ir.efep != efepNO) - { - if (gmx_mtop_has_perturbed_charges(mtop)) - { - errorReasons.emplace_back( - "free energy calculations with perturbed charges (multiple grids)"); - } - } if (EVDW_PME(ir.vdwtype)) { errorReasons.emplace_back("Lennard-Jones PME"); @@ -231,10 +223,6 @@ static bool pme_gpu_check_restrictions(const gmx_pme_t* pme, std::string* error) { errorReasons.emplace_back("interpolation orders other than 4"); } - if (pme->bFEP) - { - errorReasons.emplace_back("free energy calculations (multiple grids)"); - } if (pme->doLJ) { errorReasons.emplace_back("Lennard-Jones PME"); @@ -939,7 +927,7 @@ void gmx_pme_reinit(struct gmx_pme_t** pmedata, */ if (!pme_src->gpu && pme_src->nnodes == 1) { - gmx_pme_reinit_atoms(*pmedata, pme_src->atc[0].numAtoms(), nullptr); + gmx_pme_reinit_atoms(*pmedata, pme_src->atc[0].numAtoms(), nullptr, nullptr); } // TODO this is mostly passing around current values } @@ -1703,11 +1691,13 @@ void gmx_pme_destroy(gmx_pme_t* pme) delete pme; } -void gmx_pme_reinit_atoms(gmx_pme_t* pme, const int numAtoms, const real* charges) +void gmx_pme_reinit_atoms(gmx_pme_t* pme, const int numAtoms, const real* chargesA, const real* chargesB) { if (pme->gpu != nullptr) { - pme_gpu_reinit_atoms(pme->gpu, numAtoms, charges); + GMX_ASSERT(!(pme->bFEP_q && chargesB == nullptr), + "B state charges must be specified if running Coulomb FEP on the GPU"); + pme_gpu_reinit_atoms(pme->gpu, numAtoms, chargesA, pme->bFEP_q ? chargesB : nullptr); } else { diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index d60f2e6326..0408b87f77 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -219,9 +219,12 @@ void gmx_pme_calc_energy(gmx_pme_t* pme, gmx::ArrayRef x, gmx:: * * \param[in,out] pme The PME structure. * \param[in] numAtoms The number of particles. - * \param[in] charges The pointer to the array of particle charges. + * \param[in] chargesA The pointer to the array of particle charges in the normal state or FEP + * state A. Can be nullptr if PME is not performed on the GPU. + * \param[in] chargesB The pointer to the array of particle charges in state B. Only used if + * charges are perturbed and can otherwise be nullptr. */ -void gmx_pme_reinit_atoms(gmx_pme_t* pme, int numAtoms, const real* charges); +void gmx_pme_reinit_atoms(gmx_pme_t* pme, int numAtoms, const real* chargesA, const real* chargesB); /* A block of PME GPU functions */ @@ -251,12 +254,11 @@ bool pme_gpu_supports_hardware(const gmx_hw_info_t& hwinfo, std::string* error); * formed gmx_pme_t structure. Should that one go away/work with inputrec? 
* * \param[in] ir Input system. - * \param[in] mtop Complete system topology to check if an FE simulation perturbs charges. * \param[out] error If non-null, the error message if the input is not supported on GPU. * * \returns true if PME can run on GPU with this input, false otherwise. */ -bool pme_gpu_supports_input(const t_inputrec& ir, const gmx_mtop_t& mtop, std::string* error); +bool pme_gpu_supports_input(const t_inputrec& ir, std::string* error); /*! \brief * Returns the active PME codepath (CPU, GPU, mixed). @@ -333,12 +335,16 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t* GPU_FUNC_ARGU * Launches first stage of PME on GPU - spreading kernel. * * \param[in] pme The PME data structure. - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; nullptr allowed only on separate PME ranks. + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates + * are ready in the device memory; nullptr allowed only on separate PME ranks. * \param[in] wcycle The wallclock counter. + * \param[in] lambdaQ The Coulomb lambda of the current state of the + * system. Only used if FEP of Coulomb is active. */ GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t* GPU_FUNC_ARGUMENT(pme), GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), - gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM; + gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), + const real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM; /*! \brief * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode. @@ -355,11 +361,13 @@ pme_gpu_launch_complex_transforms(gmx_pme_t* GPU_FUNC_ARGUMENT(pme /*! \brief * Launches last stage of PME on GPU - force gathering and D2H force transfer. * - * \param[in] pme The PME data structure. - * \param[in] wcycle The wallclock counter. + * \param[in] pme The PME data structure. + * \param[in] wcycle The wallclock counter. + * \param[in] lambdaQ The Coulomb lambda to use when calculating the results. */ GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme), - gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM; + gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), + const real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM; /*! \brief * Attempts to complete PME GPU tasks. @@ -376,6 +384,7 @@ GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t* GPU_FUNC_ARGUMENT * \param[in] wcycle The wallclock counter. * \param[out] forceWithVirial The output force and virial * \param[out] enerd The output energies + * \param[in] lambdaQ The Coulomb lambda to use when calculating the results. * \param[in] completionKind Indicates whether PME task completion should only be checked rather * than waited for * \returns True if the PME GPU tasks have completed @@ -385,6 +394,7 @@ GPU_FUNC_QUALIFIER bool pme_gpu_try_finish_task(gmx_pme_t* GPU_FUN gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), gmx::ForceWithVirial* GPU_FUNC_ARGUMENT(forceWithVirial), gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd), + const real GPU_FUNC_ARGUMENT(lambdaQ), GpuTaskCompletion GPU_FUNC_ARGUMENT(completionKind)) GPU_FUNC_TERM_WITH_RETURN(false); @@ -397,12 +407,14 @@ GPU_FUNC_QUALIFIER bool pme_gpu_try_finish_task(gmx_pme_t* GPU_FUN * \param[in] wcycle The wallclock counter. * \param[out] forceWithVirial The output force and virial * \param[out] enerd The output energies + * \param[in] lambdaQ The Coulomb lambda to use when calculating the results. 
*/ GPU_FUNC_QUALIFIER void pme_gpu_wait_and_reduce(gmx_pme_t* GPU_FUNC_ARGUMENT(pme), const gmx::StepWorkload& GPU_FUNC_ARGUMENT(stepWork), gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), gmx::ForceWithVirial* GPU_FUNC_ARGUMENT(forceWithVirial), - gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd)) GPU_FUNC_TERM; + gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd), + const real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM; /*! \brief * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh index 2d51020763..8c7106acf9 100644 --- a/src/gromacs/ewald/pme_gather.clh +++ b/src/gromacs/ewald/pme_gather.clh @@ -184,6 +184,86 @@ inline void reduce_atom_forces(__local float* __restrict__ sm_forces, } } +/*! \brief Calculate the sum of the force partial components (in X, Y and Z) + * + * \param[out] fx The force partial component in the X dimension. + * \param[out] fy The force partial component in the Y dimension. + * \param[out] fz The force partial component in the Z dimension. + * \param[in] ixBase The grid line index base value in the X dimension. + * \param[in] nx The grid real size in the X dimension. + * \param[in] pny The padded grid real size in the Y dimension. + * \param[in] pnz The padded grid real size in the Z dimension. + * \param[in] constOffset The offset to calculate the global grid index. + * \param[in] splineIndexBase The base value of the spline parameter index. + * \param[in] tdy The theta and dtheta in the Y dimension. + * \param[in] tdz The theta and dtheta in the Z dimension. + * \param[in] sm_splineParams Shared memory array of spline parameters. + * \param[in] gm_grid Global memory array of the grid to use. + */ +inline void sumForceComponents(float* fx, + float* fy, + float* fz, + const int ixBase, + const int nx, + const int pny, + const int pnz, + const int constOffset, + const int splineIndexBase, + const float2 tdy, + const float2 tdz, + __local const float2* __restrict__ sm_splineParams, + __global const float* __restrict__ gm_grid) +{ +# pragma unroll order + for (int ithx = 0; (ithx < order); ithx++) + { + int ix = ixBase + ithx; + if (wrapX & (ix >= nx)) + { + ix -= nx; + } + const int gridIndexGlobal = ix * pny * pnz + constOffset; + assert(gridIndexGlobal >= 0); + const float gridValue = gm_grid[gridIndexGlobal]; + assert(isfinite(gridValue)); + const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx); + const float2 tdx = sm_splineParams[splineIndexX]; + const float fxy1 = tdz.x * gridValue; + const float fz1 = tdz.y * gridValue; + *fx += tdx.y * tdy.x * fxy1; + *fy += tdx.x * tdy.y * fxy1; + *fz += tdx.x * tdy.x * fz1; + } +} + +/*! \brief Calculate the grid forces and store them in shared memory. + * + * \param[in,out] sm_forces Shared memory array with the output forces. + * \param[in] forceIndexLocal The local (per thread) index in the sm_forces array. + * \param[in] forceIndexGlobal The index of the thread in the gm_coefficients array. + * \param[in] recipBox The reciprocal box. + * \param[in] scale The scale to use when calculating the forces (only relevant when + * using two grids). + * \param[in] gm_coefficients Global memory array of the coefficients to use. 
+ */ +inline void calculateAndStoreGridForces(__local float* __restrict__ sm_forces, + const int forceIndexLocal, + const int forceIndexGlobal, + const float recipBox[DIM][DIM], + const float scale, + __global const float* __restrict__ gm_coefficients) +{ + const float3 atomForces = vload3(forceIndexLocal, sm_forces); + float negCoefficient = -scale * gm_coefficients[forceIndexGlobal]; + float3 result; + result.x = negCoefficient * recipBox[XX][XX] * atomForces.x; + result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y); + result.z = negCoefficient + * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y + + recipBox[ZZ][ZZ] * atomForces.z); + vstore3(result, forceIndexLocal, sm_forces); +} + #endif // COMPILE_GATHER_HELPERS_ONCE /*! \brief @@ -192,8 +272,15 @@ inline void reduce_atom_forces(__local float* __restrict__ sm_forces, * Please see the file description for additional defines which this kernel expects. * * \param[in] kernelParams All the PME GPU data. - * \param[in] gm_coefficients Atom charges/coefficients. - * \param[in] gm_grid Global 3D grid. + * \param[in] gm_coefficientsA Atom charges/coefficients in the unperturbed state, or FEP + * state A. + * \param[in] gm_coefficientsB Atom charges/coefficients in FEP state B. Only used + * when spreading interpolated coefficients on one grid or spreading two sets of coefficients on two + * separate grids. + * \param[in] gm_gridA Global 3D grid for the unperturbed state, FEP + * state A or the single grid used for interpolated coefficients on one grid in FEP A/B. + * \param[in] gm_gridB Global 3D grid for FEP state B when using dual + * grids (when calculating energy and virials). * \param[in] gm_theta Atom spline parameter values * \param[in] gm_dtheta Atom spline parameter derivatives * \param[in] gm_gridlineIndices Atom gridline indices (ivec) @@ -201,13 +288,17 @@ inline void reduce_atom_forces(__local float* __restrict__ sm_forces, */ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams, - __global const float* __restrict__ gm_coefficients, - __global const float* __restrict__ gm_grid, + __global const float* __restrict__ gm_coefficientsA, + __global const float* __restrict__ gm_coefficientsB, + __global const float* __restrict__ gm_gridA, + __global const float* __restrict__ gm_gridB, __global const float* __restrict__ gm_theta, __global const float* __restrict__ gm_dtheta, __global const int* __restrict__ gm_gridlineIndices, __global float* __restrict__ gm_forces) { + assert(numGrids == 1 || numGrids == 2); + /* These are the atom indices - for the shared and global memory */ const int atomIndexLocal = get_local_id(ZZ); const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock; @@ -267,58 +358,40 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe float fy = 0.0F; float fz = 0.0F; - const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]); + int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]); - if (chargeCheck) + const int nx = kernelParams.grid.realGridSize[XX]; + const int ny = kernelParams.grid.realGridSize[YY]; + const int nz = kernelParams.grid.realGridSize[ZZ]; + const int pny = kernelParams.grid.realGridSizePadded[YY]; + const int pnz = kernelParams.grid.realGridSizePadded[ZZ]; + + const int atomWarpIndex = atomIndexLocal % atomsPerWarp; + const int warpIndex 
= atomIndexLocal / atomsPerWarp; + + const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); + const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); + const float2 tdy = sm_splineParams[splineIndexY]; + const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz); + const float2 tdz = sm_splineParams[splineIndexZ]; + + const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX]; + int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy; + if (wrapY & (iy >= ny)) { - const int nx = kernelParams.grid.realGridSize[XX]; - const int ny = kernelParams.grid.realGridSize[YY]; - const int nz = kernelParams.grid.realGridSize[ZZ]; - const int pny = kernelParams.grid.realGridSizePadded[YY]; - const int pnz = kernelParams.grid.realGridSizePadded[ZZ]; - - const int atomWarpIndex = atomIndexLocal % atomsPerWarp; - const int warpIndex = atomIndexLocal / atomsPerWarp; - - const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); - const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); - const float2 tdy = sm_splineParams[splineIndexY]; - const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz); - const float2 tdz = sm_splineParams[splineIndexZ]; - - const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX]; - int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy; - if (wrapY & (iy >= ny)) - { - iy -= ny; - } - int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz; - if (iz >= nz) - { - iz -= nz; - } - const int constOffset = iy * pnz + iz; + iy -= ny; + } + int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz; + if (iz >= nz) + { + iz -= nz; + } + const int constOffset = iy * pnz + iz; -#pragma unroll order - for (int ithx = 0; (ithx < order); ithx++) - { - int ix = ixBase + ithx; - if (wrapX & (ix >= nx)) - { - ix -= nx; - } - const int gridIndexGlobal = ix * pny * pnz + constOffset; - assert(gridIndexGlobal >= 0); - const float gridValue = gm_grid[gridIndexGlobal]; - assert(isfinite(gridValue)); - const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx); - const float2 tdx = sm_splineParams[splineIndexX]; - const float fxy1 = tdz.x * gridValue; - const float fz1 = tdz.y * gridValue; - fx += tdx.y * tdy.x * fxy1; - fy += tdx.x * tdy.y * fxy1; - fz += tdx.x * tdy.x * fz1; - } + if (chargeCheck) + { + sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, tdy, + tdz, sm_splineParams, gm_gridA); } // Reduction of partial force contributions @@ -332,22 +405,13 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe barrier(CLK_LOCAL_MEM_FENCE); /* Calculating the final forces with no component branching, atomsPerBlock threads */ - const int forceIndexLocal = threadLocalId; - const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; + const int forceIndexLocal = threadLocalId; + const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; + const float scale = kernelParams.current.scale; if (forceIndexLocal < atomsPerBlock) { - const float3 atomForces = vload3(forceIndexLocal, sm_forces); - const float negCoefficient = -gm_coefficients[forceIndexGlobal]; - float3 result; - result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x; - result.y = negCoefficient - * (kernelParams.current.recipBox[XX][YY] * atomForces.x - + kernelParams.current.recipBox[YY][YY] * atomForces.y); - result.z = negCoefficient - * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x - 
+ kernelParams.current.recipBox[YY][ZZ] * atomForces.y - + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z); - vstore3(result, forceIndexLocal, sm_forces); + calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal, + kernelParams.current.recipBox, scale, gm_coefficientsA); } #if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_) @@ -374,4 +438,46 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKe gm_forces[outputIndexGlobal] = outputForceComponent; } } + + if (numGrids == 2) + { + barrier(CLK_LOCAL_MEM_FENCE); + fx = 0.0f; + fy = 0.0f; + fz = 0.0f; + chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]); + if (chargeCheck) + { + sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, + tdy, tdz, sm_splineParams, gm_gridB); + } + reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex, + kernelParams.grid.realGridSizeFP, fx, fy, fz, sm_forceReduction, sm_forceTemp); + barrier(CLK_LOCAL_MEM_FENCE); + if (forceIndexLocal < atomsPerBlock) + { + calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal, + kernelParams.current.recipBox, 1.0F - scale, gm_coefficientsB); + } + +#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_) + /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was + * __syncwarp() in CUDA. #2519 + */ + barrier(CLK_LOCAL_MEM_FENCE); +#endif + + /* Writing or adding the final forces component-wise, single warp */ + if (threadLocalId < iterThreads) + { +#pragma unroll + for (int i = 0; i < numIter; i++) + { + const int outputIndexLocal = i * iterThreads + threadLocalId; + const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal; + const float outputForceComponent = sm_forces[outputIndexLocal]; + gm_forces[outputIndexGlobal] += outputForceComponent; + } + } + } } diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu index 2edec547ef..52814c4ce4 100644 --- a/src/gromacs/ewald/pme_gather.cu +++ b/src/gromacs/ewald/pme_gather.cu @@ -213,6 +213,119 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force } } +/*! \brief Calculate the sum of the force partial components (in X, Y and Z) + * + * \tparam[in] order The PME order (must be 4). + * \tparam[in] atomsPerWarp The number of atoms per GPU warp. + * \tparam[in] wrapX Tells if the grid is wrapped in the X dimension. + * \tparam[in] wrapY Tells if the grid is wrapped in the Y dimension. + * \param[out] fx The force partial component in the X dimension. + * \param[out] fy The force partial component in the Y dimension. + * \param[out] fz The force partial component in the Z dimension. + * \param[in] ithyMin The thread minimum index in the Y dimension. + * \param[in] ithyMax The thread maximum index in the Y dimension. + * \param[in] ixBase The grid line index base value in the X dimension. + * \param[in] iz The grid line index in the Z dimension. + * \param[in] nx The grid real size in the X dimension. + * \param[in] ny The grid real size in the Y dimension. + * \param[in] pny The padded grid real size in the Y dimension. + * \param[in] pnz The padded grid real size in the Z dimension. + * \param[in] atomIndexLocal The atom index for this thread. + * \param[in] splineIndexBase The base value of the spline parameter index. + * \param[in] tdz The theta and dtheta in the Z dimension. + * \param[in] sm_gridlineIndices Shared memory array of grid line indices. 
+ * \param[in] sm_theta Shared memory array of atom theta values. + * \param[in] sm_dtheta Shared memory array of atom dtheta values. + * \param[in] gm_grid Global memory array of the grid to use. + */ +template +__device__ __forceinline__ void sumForceComponents(float* __restrict__ fx, + float* __restrict__ fy, + float* __restrict__ fz, + const int ithyMin, + const int ithyMax, + const int ixBase, + const int iz, + const int nx, + const int ny, + const int pny, + const int pnz, + const int atomIndexLocal, + const int splineIndexBase, + const float2 tdz, + const int* __restrict__ sm_gridlineIndices, + const float* __restrict__ sm_theta, + const float* __restrict__ sm_dtheta, + const float* __restrict__ gm_grid) +{ +#pragma unroll + for (int ithy = ithyMin; ithy < ithyMax; ithy++) + { + const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); + const float2 tdy = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]); + + int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy; + if (wrapY & (iy >= ny)) + { + iy -= ny; + } + const int constOffset = iy * pnz + iz; + +#pragma unroll + for (int ithx = 0; (ithx < order); ithx++) + { + int ix = ixBase + ithx; + if (wrapX & (ix >= nx)) + { + ix -= nx; + } + const int gridIndexGlobal = ix * pny * pnz + constOffset; + assert(gridIndexGlobal >= 0); + const float gridValue = gm_grid[gridIndexGlobal]; + assert(isfinite(gridValue)); + const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx); + const float2 tdx = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]); + const float fxy1 = tdz.x * gridValue; + const float fz1 = tdz.y * gridValue; + *fx += tdx.y * tdy.x * fxy1; + *fy += tdx.x * tdy.y * fxy1; + *fz += tdx.x * tdy.x * fz1; + } + } +} + +/*! \brief Calculate the grid forces and store them in shared memory. + * + * \param[in,out] sm_forces Shared memory array with the output forces. + * \param[in] forceIndexLocal The local (per thread) index in the sm_forces array. + * \param[in] forceIndexGlobal The index of the thread in the gm_coefficients array. + * \param[in] recipBox The reciprocal box. + * \param[in] scale The scale to use when calculating the forces. For gm_coefficientsB + * (when using multiple coefficients on a single grid) the scale will be (1.0 - scale). + * \param[in] gm_coefficients Global memory array of the coefficients to use for an unperturbed + * or FEP in state A if a single grid is used (\p multiCoefficientsSingleGrid == true).If two + * separate grids are used this should be the coefficients of the grid in question. + * \param[in] gm_coefficientsB Global memory array of the coefficients to use for FEP in state B. + * Should be nullptr if two separate grids are used. + */ +__device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces, + const int forceIndexLocal, + const int forceIndexGlobal, + const float recipBox[DIM][DIM], + const float scale, + const float* __restrict__ gm_coefficients) +{ + const float3 atomForces = sm_forces[forceIndexLocal]; + float negCoefficient = -scale * gm_coefficients[forceIndexGlobal]; + float3 result; + result.x = negCoefficient * recipBox[XX][XX] * atomForces.x; + result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y); + result.z = negCoefficient + * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y + + recipBox[ZZ][ZZ] * atomForces.z); + sm_forces[forceIndexLocal] = result; +} + /*! \brief * A CUDA kernel which gathers the atom forces from the grid. 
* The grid is assumed to be wrapped in dimension Z. @@ -220,19 +333,24 @@ __device__ __forceinline__ void reduce_atom_forces(float3* __restrict__ sm_force * \tparam[in] order The PME order (must be 4 currently). * \tparam[in] wrapX Tells if the grid is wrapped in the X dimension. * \tparam[in] wrapY Tells if the grid is wrapped in the Y dimension. + * \tparam[in] numGrids The number of grids to use in the kernel. Can be 1 or 2. * \tparam[in] readGlobal Tells if we should read spline values from global memory * \tparam[in] threadsPerAtom How many threads work on each atom * * \param[in] kernelParams All the PME GPU data. */ -template +template __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams) { + assert(numGrids == 1 || numGrids == 2); + /* Global memory pointers */ - const float* __restrict__ gm_coefficients = kernelParams.atoms.d_coefficients; - const float* __restrict__ gm_grid = kernelParams.grid.d_realGrid; - float* __restrict__ gm_forces = kernelParams.atoms.d_forces; + const float* __restrict__ gm_coefficientsA = kernelParams.atoms.d_coefficients[0]; + const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1]; + const float* __restrict__ gm_gridA = kernelParams.grid.d_realGrid[0]; + const float* __restrict__ gm_gridB = kernelParams.grid.d_realGrid[1]; + float* __restrict__ gm_forces = kernelParams.atoms.d_forces; /* Global memory pointers for readGlobal */ const float* __restrict__ gm_theta = kernelParams.atoms.d_theta; @@ -328,7 +446,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ // Coordinates __shared__ float3 sm_coordinates[atomsPerBlock]; /* Staging coefficients/charges */ - pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients); + pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsA); /* Staging coordinates */ pme_gpu_stage_atom_data(sm_coordinates, gm_coordinates); @@ -339,7 +457,7 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ else { atomX = gm_coordinates[atomIndexGlobal]; - atomCharge = gm_coefficients[atomIndexGlobal]; + atomCharge = gm_coefficientsA[atomIndexGlobal]; } calculate_splines( kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices); @@ -349,70 +467,37 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ float fy = 0.0f; float fz = 0.0f; - const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]); - - if (chargeCheck) - { - const int nx = kernelParams.grid.realGridSize[XX]; - const int ny = kernelParams.grid.realGridSize[YY]; - const int nz = kernelParams.grid.realGridSize[ZZ]; - const int pny = kernelParams.grid.realGridSizePadded[YY]; - const int pnz = kernelParams.grid.realGridSizePadded[ZZ]; - - const int atomWarpIndex = atomIndexLocal % atomsPerWarp; - const int warpIndex = atomIndexLocal / atomsPerWarp; + const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]); - const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); - const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz); - const float2 tdz = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]); + const int nx = kernelParams.grid.realGridSize[XX]; + const int ny = kernelParams.grid.realGridSize[YY]; + const int nz = kernelParams.grid.realGridSize[ZZ]; + const int pny = kernelParams.grid.realGridSizePadded[YY]; + const int pnz = 
kernelParams.grid.realGridSizePadded[ZZ]; - int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz; - const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX]; + const int atomWarpIndex = atomIndexLocal % atomsPerWarp; + const int warpIndex = atomIndexLocal / atomsPerWarp; - if (iz >= nz) - { - iz -= nz; - } - int constOffset, iy; + const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex); + const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz); + const float2 tdz = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]); - const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y; - const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1; - for (int ithy = ithyMin; ithy < ithyMax; ithy++) - { - const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy); - const float2 tdy = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]); + int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz; + const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX]; - iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy; - if (wrapY & (iy >= ny)) - { - iy -= ny; - } - constOffset = iy * pnz + iz; - -#pragma unroll - for (int ithx = 0; (ithx < order); ithx++) - { - int ix = ixBase + ithx; - if (wrapX & (ix >= nx)) - { - ix -= nx; - } - const int gridIndexGlobal = ix * pny * pnz + constOffset; - assert(gridIndexGlobal >= 0); - const float gridValue = gm_grid[gridIndexGlobal]; - assert(isfinite(gridValue)); - const int splineIndexX = - getSplineParamIndex(splineIndexBase, XX, ithx); - const float2 tdx = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]); - const float fxy1 = tdz.x * gridValue; - const float fz1 = tdz.y * gridValue; - fx += tdx.y * tdy.x * fxy1; - fy += tdx.x * tdy.y * fxy1; - fz += tdx.x * tdy.x * fz1; - } - } + if (iz >= nz) + { + iz -= nz; } + const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y; + const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? 
order : threadIdx.y + 1; + if (chargeCheck) + { + sumForceComponents( + &fx, &fy, &fz, ithyMin, ithyMax, ixBase, iz, nx, ny, pny, pnz, atomIndexLocal, + splineIndexBase, tdz, sm_gridlineIndices, sm_theta, sm_dtheta, gm_gridA); + } // Reduction of partial force contributions __shared__ float3 sm_forces[atomsPerBlock]; reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex, @@ -420,22 +505,13 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ __syncthreads(); /* Calculating the final forces with no component branching, atomsPerBlock threads */ - const int forceIndexLocal = threadLocalId; - const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; + const int forceIndexLocal = threadLocalId; + const int forceIndexGlobal = atomIndexOffset + forceIndexLocal; + const float scale = kernelParams.current.scale; if (forceIndexLocal < atomsPerBlock) { - const float3 atomForces = sm_forces[forceIndexLocal]; - const float negCoefficient = -gm_coefficients[forceIndexGlobal]; - float3 result; - result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x; - result.y = negCoefficient - * (kernelParams.current.recipBox[XX][YY] * atomForces.x - + kernelParams.current.recipBox[YY][YY] * atomForces.y); - result.z = negCoefficient - * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x - + kernelParams.current.recipBox[YY][ZZ] * atomForces.y - + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z); - sm_forces[forceIndexLocal] = result; + calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal, + kernelParams.current.recipBox, scale, gm_coefficientsA); } __syncwarp(); @@ -450,18 +526,65 @@ __launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__ #pragma unroll for (int i = 0; i < numIter; i++) { - int outputIndexLocal = i * iterThreads + threadLocalId; - int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; - const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; - gm_forces[outputIndexGlobal] = outputForceComponent; + int outputIndexLocal = i * iterThreads + threadLocalId; + int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; + float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; + gm_forces[outputIndexGlobal] = outputForceComponent; + } + } + + if (numGrids == 2) + { + /* We must sync here since the same shared memory is used as above. 
*/ + __syncthreads(); + fx = 0.0f; + fy = 0.0f; + fz = 0.0f; + const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]); + if (chargeCheck) + { + sumForceComponents( + &fx, &fy, &fz, ithyMin, ithyMax, ixBase, iz, nx, ny, pny, pnz, atomIndexLocal, + splineIndexBase, tdz, sm_gridlineIndices, sm_theta, sm_dtheta, gm_gridB); + } + // Reduction of partial force contributions + reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, + lineIndex, kernelParams.grid.realGridSizeFP, + fx, fy, fz); + __syncthreads(); + + /* Calculating the final forces with no component branching, atomsPerBlock threads */ + if (forceIndexLocal < atomsPerBlock) + { + calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal, + kernelParams.current.recipBox, 1.0F - scale, gm_coefficientsB); + } + + __syncwarp(); + + /* Writing or adding the final forces component-wise, single warp */ + if (threadLocalId < iterThreads) + { +#pragma unroll + for (int i = 0; i < numIter; i++) + { + int outputIndexLocal = i * iterThreads + threadLocalId; + int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal; + float outputForceComponent = ((float*)sm_forces)[outputIndexLocal]; + gm_forces[outputIndexGlobal] += outputForceComponent; + } } } } //! Kernel instantiations // clang-format off -template __global__ void pme_gather_kernel<4, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_gather_kernel<4, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -template __global__ void pme_gather_kernel<4, true, true, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_gather_kernel<4, true, true, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -// clang-format on \ No newline at end of file +template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +// clang-format on diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index a5f54f004d..bcac411467 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -125,7 +125,6 @@ void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t* pme, enum gmx_fft_direction dir, gmx_wallcycle_t wcycle) { - GMX_ASSERT(gridIndex == 0, "Only single grid supported"); if (pme_gpu_settings(pme->gpu).performGPUFFT) { wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); @@ -190,32 +189,41 @@ void pme_gpu_prepare_computation(gmx_pme_t* pme, } } -void pme_gpu_launch_spread(gmx_pme_t* pme, GpuEventSynchronizer* xReadyOnDevice, gmx_wallcycle* wcycle) 
+void pme_gpu_launch_spread(gmx_pme_t* pme, + GpuEventSynchronizer* xReadyOnDevice, + gmx_wallcycle* wcycle, + const real lambdaQ) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); GMX_ASSERT(xReadyOnDevice || !pme->bPPnode || (GMX_GPU != GMX_GPU_CUDA), "Need a valid xReadyOnDevice on PP+PME ranks with CUDA."); + GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU."); PmeGpu* pmeGpu = pme->gpu; - const unsigned int gridIndex = 0; - real* fftgrid = pme->fftgrid[gridIndex]; + GMX_ASSERT(pmeGpu->common->ngrids == 1 || (pmeGpu->common->ngrids == 2 && pme->bFEP_q), + "If not decoupling Coulomb interactions there should only be one FEP grid. If " + "decoupling Coulomb interactions there should be two grids."); + + /* PME on GPU can currently manage two grids: + * grid_index=0: Coulomb PME with charges in the normal state or from FEP state A. + * grid_index=1: Coulomb PME with charges from FEP state B. + */ + real** fftgrids = pme->fftgrid; /* Spread the coefficients on a grid */ const bool computeSplines = true; const bool spreadCharges = true; wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); - pme_gpu_spread(pmeGpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges); + pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); } void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, const gmx::StepWorkload& stepWork) { - PmeGpu* pmeGpu = pme->gpu; - const auto& settings = pmeGpu->settings; - const unsigned int gridIndex = 0; - t_complex* cfftgrid = pme->cfftgrid[gridIndex]; + PmeGpu* pmeGpu = pme->gpu; + const auto& settings = pmeGpu->settings; // There's no support for computing energy without virial, or vice versa const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial; if (!settings.performGPUFFT) @@ -227,37 +235,44 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, co try { - /* do R2C 3D-FFT */ - parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle); - - /* solve in k-space for our local cells */ - if (settings.performGPUSolve) - { - // TODO grid ordering should be set up at pme init time. - const auto gridOrdering = settings.useDecomposition ? GridOrdering::YZX : GridOrdering::XYZ; - wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); - wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); - pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial); - wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); - wallcycle_stop(wcycle, ewcLAUNCH_GPU); - } - else + /* The 3dffts and the solve are done in a loop to simplify things, even if this means that + * there will be two kernel launches for solve. */ + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) { - wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE); -#pragma omp parallel for num_threads(pme->nthread) schedule(static) - for (int thread = 0; thread < pme->nthread; thread++) + /* do R2C 3D-FFT */ + t_complex* cfftgrid = pme->cfftgrid[gridIndex]; + parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle); + + /* solve in k-space for our local cells */ + if (settings.performGPUSolve) { - solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, pme->nthread, thread); + const auto gridOrdering = + settings.useDecomposition ? 
GridOrdering::YZX : GridOrdering::XYZ; + wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); + wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); + pme_gpu_solve(pmeGpu, gridIndex, cfftgrid, gridOrdering, computeEnergyAndVirial); + wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); + wallcycle_stop(wcycle, ewcLAUNCH_GPU); + } + else + { + wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE); +#pragma omp parallel for num_threads(pme->nthread) schedule(static) + for (int thread = 0; thread < pme->nthread; thread++) + { + solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, + pme->nthread, thread); + } + wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE); } - wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE); - } - parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle); + parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle); + } } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; } -void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycle) +void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycle, const real lambdaQ) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); @@ -268,9 +283,9 @@ void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycl wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU); wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME); - const unsigned int gridIndex = 0; - real* fftgrid = pme->fftgrid[gridIndex]; - pme_gpu_gather(pme->gpu, reinterpret_cast(fftgrid)); + + float** fftgrids = pme->fftgrid; + pme_gpu_gather(pme->gpu, fftgrids, lambdaQ); wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME); wallcycle_stop(wcycle, ewcLAUNCH_GPU); } @@ -303,6 +318,7 @@ static void pme_gpu_reduce_outputs(const bool computeEnergyAndVirial, GMX_ASSERT(enerd, "Invalid energy output manager"); forceWithVirial->addVirialContribution(output.coulombVirial_); enerd->term[F_COUL_RECIP] += output.coulombEnergy_; + enerd->dvdl_lin[efptCOUL] += output.coulombDvdl_; } if (output.haveForceOutput_) { @@ -316,6 +332,7 @@ bool pme_gpu_try_finish_task(gmx_pme_t* pme, gmx_wallcycle* wcycle, gmx::ForceWithVirial* forceWithVirial, gmx_enerdata_t* enerd, + const real lambdaQ, GpuTaskCompletion completionKind) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); @@ -355,7 +372,8 @@ bool pme_gpu_try_finish_task(gmx_pme_t* pme, pme_gpu_update_timings(pme->gpu); // There's no support for computing energy without virial, or vice versa const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial; - PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial); + PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, + pme->gpu->common->ngrids > 1 ? 
lambdaQ : 1.0); wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_, @@ -366,7 +384,10 @@ bool pme_gpu_try_finish_task(gmx_pme_t* pme, } // This is used by PME-only ranks -PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme, const bool computeEnergyAndVirial, gmx_wallcycle* wcycle) +PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme, + const bool computeEnergyAndVirial, + const real lambdaQ, + gmx_wallcycle* wcycle) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); @@ -380,7 +401,8 @@ PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme, const bool computeEnergyAndVi pme_gpu_synchronize(pme->gpu); } - PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial); + PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, + pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0); wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER); return output; } @@ -390,11 +412,13 @@ void pme_gpu_wait_and_reduce(gmx_pme_t* pme, const gmx::StepWorkload& stepWork, gmx_wallcycle* wcycle, gmx::ForceWithVirial* forceWithVirial, - gmx_enerdata_t* enerd) + gmx_enerdata_t* enerd, + const real lambdaQ) { // There's no support for computing energy without virial, or vice versa const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial; - PmeOutput output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, wcycle); + PmeOutput output = pme_gpu_wait_finish_task( + pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0, wcycle); GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_, "When forces are reduced on the CPU, there needs to be force output"); pme_gpu_reduce_outputs(computeEnergyAndVirial, output, wcycle, forceWithVirial, enerd); diff --git a/src/gromacs/ewald/pme_gpu_3dfft.cu b/src/gromacs/ewald/pme_gpu_3dfft.cu index 9f9578962e..ec928f745a 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.cu +++ b/src/gromacs/ewald/pme_gpu_3dfft.cu @@ -60,7 +60,7 @@ static void handleCufftError(cufftResult_t status, const char* msg) } } -GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) +GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex) { const PmeGpuCudaKernelParams* kernelParamsPtr = pmeGpu->kernelParams.get(); ivec realGridSize, realGridSizePadded, complexGridSizePadded; @@ -79,9 +79,9 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) const int realGridSizePaddedTotal = realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ]; - realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid; + realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid[gridIndex]; GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid"); - complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid; + complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid[gridIndex]; GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid"); cufftResult_t result; diff --git a/src/gromacs/ewald/pme_gpu_3dfft.h b/src/gromacs/ewald/pme_gpu_3dfft.h index fc6d67a935..96cfa9ed6a 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft.h +++ b/src/gromacs/ewald/pme_gpu_3dfft.h @@ -73,8 +73,9 @@ public: * Constructs CUDA/OpenCL FFT plans for performing 3D FFT on a PME grid. * * \param[in] pmeGpu The PME GPU structure. + * \param[in] gridIndex The index of the grid on which to perform the calculations. */ - GpuParallel3dFft(const PmeGpu* pmeGpu); + GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex); /*! 
\brief Destroys the FFT plans. */ ~GpuParallel3dFft(); /*! \brief Performs the FFT transform in given direction diff --git a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp index b341a27b82..4ba6649497 100644 --- a/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_3dfft_ocl.cpp @@ -63,7 +63,7 @@ static void handleClfftError(clfftStatus status, const char* msg) } } -GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) +GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex) { // Extracting all the data from PME GPU std::array realGridSize, realGridSizePadded, complexGridSizePadded; @@ -82,8 +82,8 @@ GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu) } cl_context context = pmeGpu->archSpecific->deviceContext_.context(); deviceStreams_.push_back(pmeGpu->archSpecific->pmeStream_.stream()); - realGrid_ = kernelParamsPtr->grid.d_realGrid; - complexGrid_ = kernelParamsPtr->grid.d_fourierGrid; + realGrid_ = kernelParamsPtr->grid.d_realGrid[gridIndex]; + complexGrid_ = kernelParamsPtr->grid.d_fourierGrid[gridIndex]; const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT; diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh index bfdc3fc0e7..05649b600a 100644 --- a/src/gromacs/ewald/pme_gpu_calculate_splines.cuh +++ b/src/gromacs/ewald/pme_gpu_calculate_splines.cuh @@ -44,6 +44,7 @@ #include #include "gromacs/gpu_utils/cuda_kernel_utils.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" #include "pme.cuh" #include "pme_grid.h" @@ -139,13 +140,13 @@ __device__ inline void assertIsFinite(T arg) /*! \brief * General purpose function for loading atom-related data from global to shared memory. * - * \tparam[in] T Data type (float/int/...) - * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be - * accounted for in the size of the shared memory array. - * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for - * an rvec coordinates array). - * \param[out] sm_destination Shared memory array for output. - * \param[in] gm_source Global memory array for input. + * \tparam[in] T Data type (float/int/...) + * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in + * the size of the shared memory array. + * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec + * coordinates array). + * \param[out] sm_destination Shared memory array for output. + * \param[in] gm_source Global memory array for input. 
*/ template __device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_destination, diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 63b77aa86f..635f0f1a62 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -133,26 +133,46 @@ void pme_gpu_synchronize(const PmeGpu* pmeGpu) void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu) { const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float); - allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount, - pmeGpu->archSpecific->deviceContext_); - pmalloc(reinterpret_cast(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize); + + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex], + c_virialAndEnergyCount, pmeGpu->archSpecific->deviceContext_); + pmalloc(reinterpret_cast(&pmeGpu->staging.h_virialAndEnergy[gridIndex]), energyAndVirialSize); + } } void pme_gpu_free_energy_virial(PmeGpu* pmeGpu) { - freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy); - pfree(pmeGpu->staging.h_virialAndEnergy); - pmeGpu->staging.h_virialAndEnergy = nullptr; + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex]); + pfree(pmeGpu->staging.h_virialAndEnergy[gridIndex]); + pmeGpu->staging.h_virialAndEnergy[gridIndex] = nullptr; + } } void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu) { - clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0, - c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex], 0, + c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_); + } } -void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) +void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, const int gridIndex) { + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + GMX_ASSERT(gridIndex < pmeGpu->common->ngrids, + "Invalid combination of gridIndex and number of grids"); + const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX], pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY] }; @@ -161,33 +181,36 @@ void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu) const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX] + pmeGpu->kernelParams->grid.realGridSize[YY] + pmeGpu->kernelParams->grid.realGridSize[ZZ]; - const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize); - reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize, - &pmeGpu->archSpecific->splineValuesSize, - &pmeGpu->archSpecific->splineValuesSizeAlloc, + const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize[gridIndex]); + reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex], + newSplineValuesSize, 
&pmeGpu->archSpecific->splineValuesSize[gridIndex], + &pmeGpu->archSpecific->splineValuesCapacity[gridIndex], pmeGpu->archSpecific->deviceContext_); if (shouldRealloc) { /* Reallocate the host buffer */ - pfree(pmeGpu->staging.h_splineModuli); - pmalloc(reinterpret_cast(&pmeGpu->staging.h_splineModuli), + pfree(pmeGpu->staging.h_splineModuli[gridIndex]); + pmalloc(reinterpret_cast(&pmeGpu->staging.h_splineModuli[gridIndex]), newSplineValuesSize * sizeof(float)); } for (int i = 0; i < DIM; i++) { - memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i], + memcpy(pmeGpu->staging.h_splineModuli[gridIndex] + splineValuesOffset[i], pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float)); } /* TODO: pin original buffer instead! */ - copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli, - 0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream_, - pmeGpu->settings.transferKind, nullptr); + copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex], + pmeGpu->staging.h_splineModuli[gridIndex], 0, newSplineValuesSize, + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu) { - pfree(pmeGpu->staging.h_splineModuli); - freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + pfree(pmeGpu->staging.h_splineModuli[gridIndex]); + freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex]); + } } void pme_gpu_realloc_forces(PmeGpu* pmeGpu) @@ -224,16 +247,18 @@ void pme_gpu_copy_output_forces(PmeGpu* pmeGpu) pmeGpu->settings.transferKind, nullptr); } -void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients) +void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu, + const float* h_coefficients, + const int gridIndex) { GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU"); const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc; GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU"); - reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize, - &pmeGpu->archSpecific->coefficientsSize, - &pmeGpu->archSpecific->coefficientsSizeAlloc, + reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], + newCoefficientsSize, &pmeGpu->archSpecific->coefficientsSize[gridIndex], + &pmeGpu->archSpecific->coefficientsCapacity[gridIndex], pmeGpu->archSpecific->deviceContext_); - copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, + copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], const_cast(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms, pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); @@ -241,14 +266,17 @@ void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_ const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex; if (paddingCount > 0) { - clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex, + clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], paddingIndex, paddingCount, pmeGpu->archSpecific->pmeStream_); } } void pme_gpu_free_coefficients(const PmeGpu* pmeGpu) { - freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + 
freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex]); + } } void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu) @@ -304,51 +332,65 @@ void pme_gpu_free_grid_indices(const PmeGpu* pmeGpu) void pme_gpu_realloc_grids(PmeGpu* pmeGpu) { - auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX] * kernelParamsPtr->grid.realGridSizePadded[YY] * kernelParamsPtr->grid.realGridSizePadded[ZZ]; const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX] * kernelParamsPtr->grid.complexGridSizePadded[YY] * kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2; - // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float - if (pmeGpu->archSpecific->performOutOfPlaceFFT) - { - /* 2 separate grids */ - reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize, - &pmeGpu->archSpecific->complexGridSize, - &pmeGpu->archSpecific->complexGridSizeAlloc, - pmeGpu->archSpecific->deviceContext_); - reallocateDeviceBuffer( - &kernelParamsPtr->grid.d_realGrid, newRealGridSize, &pmeGpu->archSpecific->realGridSize, - &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_); - } - else + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) { - /* A single buffer so that any grid will fit */ - const int newGridsSize = std::max(newRealGridSize, newComplexGridSize); - reallocateDeviceBuffer( - &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize, - &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_); - kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid; - pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize; - // the size might get used later for copying the grid + // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float + if (pmeGpu->archSpecific->performOutOfPlaceFFT) + { + /* 2 separate grids */ + reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], newComplexGridSize, + &pmeGpu->archSpecific->complexGridSize[gridIndex], + &pmeGpu->archSpecific->complexGridCapacity[gridIndex], + pmeGpu->archSpecific->deviceContext_); + reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newRealGridSize, + &pmeGpu->archSpecific->realGridSize[gridIndex], + &pmeGpu->archSpecific->realGridCapacity[gridIndex], + pmeGpu->archSpecific->deviceContext_); + } + else + { + /* A single buffer so that any grid will fit */ + const int newGridsSize = std::max(newRealGridSize, newComplexGridSize); + reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newGridsSize, + &pmeGpu->archSpecific->realGridSize[gridIndex], + &pmeGpu->archSpecific->realGridCapacity[gridIndex], + pmeGpu->archSpecific->deviceContext_); + kernelParamsPtr->grid.d_fourierGrid[gridIndex] = kernelParamsPtr->grid.d_realGrid[gridIndex]; + pmeGpu->archSpecific->complexGridSize[gridIndex] = + pmeGpu->archSpecific->realGridSize[gridIndex]; + // the size might get used later for copying the grid + } } } void pme_gpu_free_grids(const PmeGpu* pmeGpu) { - if (pmeGpu->archSpecific->performOutOfPlaceFFT) + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) { - freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid); + if (pmeGpu->archSpecific->performOutOfPlaceFFT) + { + 
freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid[gridIndex]); + } + freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex]); } - freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid); } void pme_gpu_clear_grids(const PmeGpu* pmeGpu) { - clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0, - pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0, + pmeGpu->archSpecific->realGridSize[gridIndex], + pmeGpu->archSpecific->pmeStream_); + } } void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu) @@ -395,17 +437,18 @@ bool pme_gpu_stream_query(const PmeGpu* pmeGpu) return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream_); } -void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid) +void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, const int gridIndex) { - copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize, + copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], h_grid, 0, + pmeGpu->archSpecific->realGridSize[gridIndex], pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } -void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid) +void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, const int gridIndex) { - copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0, - pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_, - pmeGpu->settings.transferKind, nullptr); + copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0, + pmeGpu->archSpecific->realGridSize[gridIndex], + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_); } @@ -488,9 +531,9 @@ void pme_gpu_reinit_3dfft(const PmeGpu* pmeGpu) if (pme_gpu_settings(pmeGpu).performGPUFFT) { pmeGpu->archSpecific->fftSetup.resize(0); - for (int i = 0; i < pmeGpu->common->ngrids; i++) + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) { - pmeGpu->archSpecific->fftSetup.push_back(std::make_unique(pmeGpu)); + pmeGpu->archSpecific->fftSetup.push_back(std::make_unique(pmeGpu, gridIndex)); } } } @@ -500,26 +543,62 @@ void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu) pmeGpu->archSpecific->fftSetup.resize(0); } -void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, PmeOutput* output) +void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, const float lambda, PmeOutput* output) { const PmeGpu* pmeGpu = pme.gpu; - for (int j = 0; j < c_virialAndEnergyCount; j++) - { - GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[j]), - "PME GPU produces incorrect energy/virial."); - } - - unsigned int j = 0; - output->coulombVirial_[XX][XX] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[YY][YY] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[ZZ][ZZ] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[XX][YY] = output->coulombVirial_[YY][XX] = - 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[XX][ZZ] = output->coulombVirial_[ZZ][XX] = - 0.25F * pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombVirial_[YY][ZZ] = output->coulombVirial_[ZZ][YY] = - 0.25F * 
pmeGpu->staging.h_virialAndEnergy[j++]; - output->coulombEnergy_ = 0.5F * pmeGpu->staging.h_virialAndEnergy[j++]; + + GMX_ASSERT(lambda == 1.0 || pmeGpu->common->ngrids == 2, + "Invalid combination of lambda and number of grids"); + + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + for (int j = 0; j < c_virialAndEnergyCount; j++) + { + GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[gridIndex][j]), + "PME GPU produces incorrect energy/virial."); + } + } + for (int dim1 = 0; dim1 < DIM; dim1++) + { + for (int dim2 = 0; dim2 < DIM; dim2++) + { + output->coulombVirial_[dim1][dim2] = 0; + } + } + output->coulombEnergy_ = 0; + float scale = 1.0; + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + if (pmeGpu->common->ngrids == 2) + { + scale = gridIndex == 0 ? (1.0 - lambda) : lambda; + } + output->coulombVirial_[XX][XX] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][0]; + output->coulombVirial_[YY][YY] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][1]; + output->coulombVirial_[ZZ][ZZ] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][2]; + output->coulombVirial_[XX][YY] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][3]; + output->coulombVirial_[YY][XX] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][3]; + output->coulombVirial_[XX][ZZ] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][4]; + output->coulombVirial_[ZZ][XX] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][4]; + output->coulombVirial_[YY][ZZ] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][5]; + output->coulombVirial_[ZZ][YY] += + scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][5]; + output->coulombEnergy_ += scale * 0.5F * pmeGpu->staging.h_virialAndEnergy[gridIndex][6]; + } + if (pmeGpu->common->ngrids > 1) + { + output->coulombDvdl_ = 0.5F + * (pmeGpu->staging.h_virialAndEnergy[FEP_STATE_B][6] + - pmeGpu->staging.h_virialAndEnergy[FEP_STATE_A][6]); + } } /*! 
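The per-grid reduction above is the standard linear Coulomb interpolation between the two FEP end states. Writing E_A and E_B for the energies obtained from the state-A and state-B grids (element 6 of each raw virial/energy buffer holds twice the energy, hence the factors of 0.5F), the reported quantities are

    E(\lambda) = (1 - \lambda)\, E_A + \lambda\, E_B
    \frac{\partial E}{\partial \lambda} = E_B - E_A

so coulombDvdl_ is simply half the difference of the two raw accumulator values, and the virial tensor components are combined with the same (1 - \lambda) and \lambda weights.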
\brief Sets the force-related members in \p output @@ -536,7 +615,7 @@ static void pme_gpu_getForceOutput(PmeGpu* pmeGpu, PmeOutput* output) } } -PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVirial) +PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVirial, const real lambdaQ) { PmeGpu* pmeGpu = pme.gpu; @@ -548,7 +627,7 @@ PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVir { if (pme_gpu_settings(pmeGpu).performGPUSolve) { - pme_gpu_getEnergyAndVirial(pme, &output); + pme_gpu_getEnergyAndVirial(pme, lambdaQ, &output); } else { @@ -590,9 +669,13 @@ void pme_gpu_update_input_box(PmeGpu gmx_unused* pmeGpu, const matrix gmx_unused static void pme_gpu_reinit_grids(PmeGpu* pmeGpu) { auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu); + + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + kernelParamsPtr->grid.ewaldFactor = (M_PI * M_PI) / (pmeGpu->common->ewaldcoeff_q * pmeGpu->common->ewaldcoeff_q); - /* The grid size variants */ for (int i = 0; i < DIM; i++) { @@ -613,14 +696,17 @@ static void pme_gpu_reinit_grids(PmeGpu* pmeGpu) kernelParamsPtr->grid.realGridSizePadded[ZZ] = (kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2; } - /* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */ kernelParamsPtr->grid.complexGridSize[ZZ] /= 2; kernelParamsPtr->grid.complexGridSize[ZZ]++; kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ]; pme_gpu_realloc_and_copy_fract_shifts(pmeGpu); - pme_gpu_realloc_and_copy_bspline_values(pmeGpu); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + pme_gpu_realloc_and_copy_bspline_values(pmeGpu, gridIndex); + } + pme_gpu_realloc_grids(pmeGpu); pme_gpu_reinit_3dfft(pmeGpu); } @@ -641,7 +727,7 @@ static void pme_gpu_copy_common_data_from(const gmx_pme_t* pme) /* TODO: Consider refactoring the CPU PME code to use the same structure, * so that this function becomes 2 lines */ PmeGpu* pmeGpu = pme->gpu; - pmeGpu->common->ngrids = pme->ngrids; + pmeGpu->common->ngrids = pme->bFEP_q ? 
2 : 1; pmeGpu->common->epsilon_r = pme->epsilon_r; pmeGpu->common->ewaldcoeff_q = pme->ewaldcoeff_q; pmeGpu->common->nk[XX] = pme->nkx; @@ -725,9 +811,9 @@ static void pme_gpu_init(gmx_pme_t* pme, pmeGpu->initializedClfftLibrary_ = std::make_unique(); pme_gpu_init_internal(pmeGpu, deviceContext, deviceStream); - pme_gpu_alloc_energy_virial(pmeGpu); pme_gpu_copy_common_data_from(pme); + pme_gpu_alloc_energy_virial(pmeGpu); GMX_ASSERT(pmeGpu->common->epsilon_r != 0.0F, "PME GPU: bad electrostatic coefficient"); @@ -804,7 +890,7 @@ void pme_gpu_destroy(PmeGpu* pmeGpu) delete pmeGpu; } -void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges) +void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* chargesA, const real* chargesB) { auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu); kernelParamsPtr->atoms.nAtoms = nAtoms; @@ -817,8 +903,25 @@ void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges) GMX_RELEASE_ASSERT(false, "Only single precision supported"); GMX_UNUSED_VALUE(charges); #else - pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast(charges)); + int gridIndex = 0; /* Could also be checked for haveToRealloc, but the copy always needs to be performed */ + pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast(chargesA), gridIndex); + gridIndex++; + if (chargesB != nullptr) + { + pme_gpu_realloc_and_copy_input_coefficients( + pmeGpu, reinterpret_cast(chargesB), gridIndex); + } + else + { + /* Fill the second set of coefficients with chargesA as well to be able to avoid + * conditionals in the GPU kernels */ + /* FIXME: This should be avoided by making a separate templated version of the + * relevant kernel(s) (probably only pme_gather_kernel). That would require a + * reduction of the current number of templated parameters of that kernel. */ + pme_gpu_realloc_and_copy_input_coefficients( + pmeGpu, reinterpret_cast(chargesA), gridIndex); + } #endif if (haveToRealloc) @@ -849,7 +952,7 @@ static CommandEvent* pme_gpu_fetch_timing_event(const PmeGpu* pmeGpu, size_t PME return timingEvent; } -void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index) +void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, const int grid_index) { int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R; @@ -881,32 +984,64 @@ std::pair inline pmeGpuCreateGrid(const PmeGpu* pmeGpu, int blockCount * \param[in] pmeGpu The PME GPU structure. * \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom * \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory + * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed. 
* * \return Pointer to CUDA kernel */ -static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, bool writeSplinesToGlobal) +static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, + ThreadsPerAtom threadsPerAtom, + bool writeSplinesToGlobal, + const int numGrids) { PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; if (writeSplinesToGlobal) { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesSingle; + } } } else { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelSingle; + } } } @@ -919,12 +1054,14 @@ static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom * \param[in] pmeGpu The PME GPU structure. * \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom * \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory + * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed. * * \return Pointer to CUDA kernel */ static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, - bool gmx_unused writeSplinesToGlobal) + bool gmx_unused writeSplinesToGlobal, + const int numGrids) { PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; GMX_ASSERT( @@ -933,11 +1070,25 @@ static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->splineKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineKernelSingle; + } } return kernelPtr; } @@ -948,21 +1099,38 @@ static auto selectSplineKernelPtr(const PmeGpu* pmeGpu, * \param[in] pmeGpu The PME GPU structure. * \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom * \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory + * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed. 
* * \return Pointer to CUDA kernel */ -static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, bool writeSplinesToGlobal) +static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, + ThreadsPerAtom threadsPerAtom, + bool writeSplinesToGlobal, + const int numGrids) { PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; if (writeSplinesToGlobal) { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Dual; + } + { + kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelSingle; + } } } else @@ -971,11 +1139,25 @@ static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPe using the spline and spread Kernel */ if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelSingle; + } } } return kernelPtr; @@ -983,14 +1165,18 @@ static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPe void pme_gpu_spread(const PmeGpu* pmeGpu, GpuEventSynchronizer* xReadyOnDevice, - int gmx_unused gridIndex, - real* h_grid, - bool computeSplines, - bool spreadCharges) + real** h_grids, + bool computeSplines, + bool spreadCharges, + const real lambda) { + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)"); - const auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + auto* kernelParamsPtr = pmeGpu->kernelParams.get(); GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread"); const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize; @@ -1031,6 +1217,15 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock; auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); + if (pmeGpu->common->ngrids == 1) + { + kernelParamsPtr->current.scale = 1.0; + } + else + { + kernelParamsPtr->current.scale = 1.0 - lambda; + } + KernelLaunchConfig config; config.blockSize[0] = order; config.blockSize[1] = (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? 
1 : order); @@ -1046,20 +1241,22 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, { timingId = gtPME_SPLINEANDSPREAD; kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines)); + writeGlobal || (!recalculateSplines), + pmeGpu->common->ngrids); } else { timingId = gtPME_SPLINE; kernelPtr = selectSplineKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines)); + writeGlobal || (!recalculateSplines), + pmeGpu->common->ngrids); } } else { timingId = gtPME_SPREAD; kernelPtr = selectSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines)); + writeGlobal || (!recalculateSplines), pmeGpu->common->ngrids); } @@ -1071,9 +1268,10 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, const auto kernelArgs = prepareGpuKernelArguments( kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta, &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices, - &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->grid.d_fractShiftsTable, - &kernelParamsPtr->grid.d_gridlineIndicesTable, &kernelParamsPtr->atoms.d_coefficients, - &kernelParamsPtr->atoms.d_coordinates); + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], + &kernelParamsPtr->grid.d_fractShiftsTable, &kernelParamsPtr->grid.d_gridlineIndicesTable, + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], &kernelParamsPtr->atoms.d_coordinates); #endif launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, @@ -1084,7 +1282,11 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, const bool copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs); if (copyBackGrid) { - pme_gpu_copy_output_spread_grid(pmeGpu, h_grid); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + float* h_grid = h_grids[gridIndex]; + pme_gpu_copy_output_spread_grid(pmeGpu, h_grid, gridIndex); + } } const bool copyBackAtomData = computeSplines && (!settings.performGPUGather || settings.copyAllOutputs); @@ -1094,8 +1296,18 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, } } -void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial) +void pme_gpu_solve(const PmeGpu* pmeGpu, + const int gridIndex, + t_complex* h_grid, + GridOrdering gridOrdering, + bool computeEnergyAndVirial) { + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + GMX_ASSERT(gridIndex < pmeGpu->common->ngrids, + "Invalid combination of gridIndex and number of grids"); + const auto& settings = pmeGpu->settings; const bool copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs; @@ -1104,9 +1316,9 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd float* h_gridFloat = reinterpret_cast(h_grid); if (copyInputAndOutputGrid) { - copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0, - pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_, - pmeGpu->settings.transferKind, nullptr); + copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], h_gridFloat, 0, + pmeGpu->archSpecific->complexGridSize[gridIndex], + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } int majorDim = -1, middleDim = -1, minorDim = -1; @@ 
-1160,13 +1372,29 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; if (gridOrdering == GridOrdering::YZX) { - kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernel - : pmeGpu->programHandle_->impl_->solveYZXKernel; + if (gridIndex == 0) + { + kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernelA + : pmeGpu->programHandle_->impl_->solveYZXKernelA; + } + else + { + kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernelB + : pmeGpu->programHandle_->impl_->solveYZXKernelB; + } } else if (gridOrdering == GridOrdering::XYZ) { - kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernel - : pmeGpu->programHandle_->impl_->solveXYZKernel; + if (gridIndex == 0) + { + kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernelA + : pmeGpu->programHandle_->impl_->solveXYZKernelA; + } + else + { + kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernelB + : pmeGpu->programHandle_->impl_->solveXYZKernelB; + } } pme_gpu_start_timing(pmeGpu, timingId); @@ -1175,8 +1403,9 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); #else const auto kernelArgs = prepareGpuKernelArguments( - kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli, - &kernelParamsPtr->constants.d_virialAndEnergy, &kernelParamsPtr->grid.d_fourierGrid); + kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli[gridIndex], + &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex], + &kernelParamsPtr->grid.d_fourierGrid[gridIndex]); #endif launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve", kernelArgs); @@ -1184,16 +1413,17 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd if (computeEnergyAndVirial) { - copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy, - &kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount, - pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); + copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy[gridIndex], + &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex], 0, + c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_, + pmeGpu->settings.transferKind, nullptr); } if (copyInputAndOutputGrid) { - copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0, - pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_, - pmeGpu->settings.transferKind, nullptr); + copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid[gridIndex], 0, + pmeGpu->archSpecific->complexGridSize[gridIndex], + pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr); } } @@ -1203,10 +1433,14 @@ void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrd * \param[in] pmeGpu The PME GPU structure. * \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom * \param[in] readSplinesFromGlobal bool controlling if we should write spline data to global memory + * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed. 
* * \return Pointer to CUDA kernel */ -inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, bool readSplinesFromGlobal) +inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, + ThreadsPerAtom threadsPerAtom, + bool readSplinesFromGlobal, + const int numGrids) { PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; @@ -1215,34 +1449,70 @@ inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPe { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplines; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesSingle; + } } } else { if (threadsPerAtom == ThreadsPerAtom::Order) { - kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4Dual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4Single; + } } else { - kernelPtr = pmeGpu->programHandle_->impl_->gatherKernel; + if (numGrids == 2) + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelDual; + } + else + { + kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelSingle; + } } } return kernelPtr; } - -void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid) +void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda) { + GMX_ASSERT( + pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, + "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU"); + const auto& settings = pmeGpu->settings; + if (!settings.performGPUFFT || settings.copyAllOutputs) { - pme_gpu_copy_input_gather_grid(pmeGpu, const_cast(h_grid)); + for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++) + { + float* h_grid = const_cast(h_grids[gridIndex]); + pme_gpu_copy_input_gather_grid(pmeGpu, h_grid, gridIndex); + } } if (settings.copyAllOutputs) @@ -1280,22 +1550,33 @@ void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid) // TODO test different cache configs - int timingId = gtPME_GATHER; - PmeGpuProgramImpl::PmeKernelHandle kernelPtr = selectGatherKernelPtr( - pmeGpu, pmeGpu->settings.threadsPerAtom, readGlobal || (!recalculateSplines)); + int timingId = gtPME_GATHER; + PmeGpuProgramImpl::PmeKernelHandle kernelPtr = + selectGatherKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, + readGlobal || (!recalculateSplines), pmeGpu->common->ngrids); // TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl pme_gpu_start_timing(pmeGpu, timingId); - auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); - const auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + auto* kernelParamsPtr = pmeGpu->kernelParams.get(); + if (pmeGpu->common->ngrids == 1) + { + kernelParamsPtr->current.scale = 1.0; + } + else + { + kernelParamsPtr->current.scale = 1.0 - lambda; + } + #if c_canEmbedBuffers const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); #else const auto kernelArgs = prepareGpuKernelArguments( - 
kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients, - &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->atoms.d_theta, - &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices, - &kernelParamsPtr->atoms.d_forces); + kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], + &kernelParamsPtr->atoms.d_theta, &kernelParamsPtr->atoms.d_dtheta, + &kernelParamsPtr->atoms.d_gridlineIndices, &kernelParamsPtr->atoms.d_forces); #endif launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather", kernelArgs); diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index f2cbc94c5d..1220b13984 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -69,6 +69,15 @@ struct PmeGpuStaging; struct PmeGpuSettings; struct t_complex; +#ifndef FEP_STATE_A +//! Grid index of FEP state A (or unperturbed system) +# define FEP_STATE_A 0 +#endif +#ifndef FEP_STATE_B +//! Grid index of FEP state B +# define FEP_STATE_B 1 +#endif + namespace gmx { template @@ -134,8 +143,10 @@ void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu); * Reallocates and copies the pre-computed B-spline values to the GPU. * * \param[in,out] pmeGpu The PME GPU structure. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. */ -void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu); +void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, int gridIndex = 0); /*! \libinternal \brief * Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers). @@ -188,11 +199,15 @@ bool pme_gpu_stream_query(const PmeGpu* pmeGpu); * * \param[in] pmeGpu The PME GPU structure. * \param[in] h_coefficients The input atom charges/coefficients. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. * * Does not need to be done for every PME computation, only whenever the local charges change. * (So, in the beginning of the run, or on DD step). */ -void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients); +void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu, + const float* h_coefficients, + int gridIndex = 0); /*! \libinternal \brief * Frees the charges/coefficients on the GPU. @@ -268,30 +283,34 @@ void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu); /*! \libinternal \brief * Copies the input real-space grid from the host to the GPU. * - * \param[in] pmeGpu The PME GPU structure. - * \param[in] h_grid The host-side grid buffer. + * \param[in] pmeGpu The PME GPU structure. + * \param[in] h_grid The host-side grid buffer. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. */ -void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid); +void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, int gridIndex = 0); /*! \libinternal \brief * Copies the output real-space grid from the GPU to the host. * - * \param[in] pmeGpu The PME GPU structure. - * \param[out] h_grid The host-side grid buffer. + * \param[in] pmeGpu The PME GPU structure. 
+ * \param[out] h_grid The host-side grid buffer. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. */ -void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid); +void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, int gridIndex = 0); /*! \libinternal \brief * Copies the spread output spline data and gridline indices from the GPU to the host. * - * \param[in] pmeGpu The PME GPU structure. + * \param[in] pmeGpu The PME GPU structure. */ void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu); /*! \libinternal \brief * Copies the gather input spline data and gridline indices from the host to the GPU. * - * \param[in] pmeGpu The PME GPU structure. + * \param[in] pmeGpu The PME GPU structure. */ void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu); @@ -321,40 +340,44 @@ void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu); /*! \libinternal \brief * A GPU spline computation and charge spreading function. * - * \param[in] pmeGpu The PME GPU structure. - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; - * can be nullptr when invoked on a separate PME rank or from PME tests. - * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0. - * \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host, - * e.g. testing or host-side FFT) - * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed. - * \param[in] spreadCharges Should the charges/coefficients be spread on the grid. + * \param[in] pmeGpu The PME GPU structure. + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; + * can be nullptr when invoked on a separate PME rank or from PME tests. + * \param[out] h_grids The host-side grid buffers (used only if the result of the spread is expected on the host, + * e.g. testing or host-side FFT) + * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed. + * \param[in] spreadCharges Should the charges/coefficients be spread on the grid. + * \param[in] lambda The lambda value of the current system state. */ GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), - int GPU_FUNC_ARGUMENT(gridIndex), - real* GPU_FUNC_ARGUMENT(h_grid), + float** GPU_FUNC_ARGUMENT(h_grids), bool GPU_FUNC_ARGUMENT(computeSplines), - bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM; + bool GPU_FUNC_ARGUMENT(spreadCharges), + const real GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM; /*! \libinternal \brief * 3D FFT R2C/C2R routine. * * \param[in] pmeGpu The PME GPU structure. * \param[in] direction Transform direction (real-to-complex or complex-to-real) - * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0. + * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. */ -void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex); +void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex = 0); /*! \libinternal \brief * A GPU Fourier space solving function. * * \param[in] pmeGpu The PME GPU structure. + * \param[in] gridIndex The index of the grid to use. 
0 is Coulomb in the normal + * state or FEP state A and 1 is Coulomb in FEP state B. * \param[in,out] h_grid The host-side input and output Fourier grid buffer (used only with testing or host-side FFT) * \param[in] gridOrdering Specifies the dimenion ordering of the complex grid. TODO: store this information? * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed. */ GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), + int GPU_FUNC_ARGUMENT(gridIndex), t_complex* GPU_FUNC_ARGUMENT(h_grid), GridOrdering GPU_FUNC_ARGUMENT(gridOrdering), bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) GPU_FUNC_TERM; @@ -362,11 +385,14 @@ GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), /*! \libinternal \brief * A GPU force gathering function. * - * \param[in] pmeGpu The PME GPU structure. - * reductions. \param[in] h_grid The host-side grid buffer (used only in testing mode) + * \param[in] pmeGpu The PME GPU structure. + * \param[in] h_grids The host-side grid buffer (used only in testing mode). + * \param[in] lambda The lambda value to use. */ -GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), - const float* GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM; +GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), + float** GPU_FUNC_ARGUMENT(h_grids), + float GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM; + /*! \brief Sets the device pointer to coordinate data * \param[in] pmeGpu The PME GPU structure. @@ -435,9 +461,11 @@ inline void pme_gpu_set_testing(PmeGpu* pmeGpu, bool testing) * handled the solve stage. * * \param[in] pme The PME structure. + * \param[in] lambda The lambda value to use when calculating the results. * \param[out] output Pointer to output where energy and virial should be stored. */ GPU_FUNC_QUALIFIER void pme_gpu_getEnergyAndVirial(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme), + float GPU_FUNC_ARGUMENT(lambda), PmeOutput* GPU_FUNC_ARGUMENT(output)) GPU_FUNC_TERM; /*! \libinternal \brief @@ -445,10 +473,12 @@ GPU_FUNC_QUALIFIER void pme_gpu_getEnergyAndVirial(const gmx_pme_t& GPU_FUNC_ARG * * \param[in] pme The PME structure. * \param[in] computeEnergyAndVirial Whether the energy and virial are being computed + * \param[in] lambdaQ The Coulomb lambda to use when finalizing the output. * \returns The output object. */ GPU_FUNC_QUALIFIER PmeOutput pme_gpu_getOutput(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme), - bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) + bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial), + real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM_WITH_RETURN(PmeOutput{}); /*! \libinternal \brief @@ -510,14 +540,16 @@ GPU_FUNC_QUALIFIER void pme_gpu_destroy(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu)) GPU_F * * \param[in] pmeGpu The PME GPU structure. * \param[in] nAtoms The number of particles. - * \param[in] charges The pointer to the host-side array of particle charges. + * \param[in] chargesA The pointer to the host-side array of particle charges in the unperturbed state or FEP state A. + * \param[in] chargesB The pointer to the host-side array of particle charges in FEP state B. * * This is a function that should only be called in the beginning of the run and on domain * decomposition. Should be called before the pme_gpu_set_io_ranges. 
*/ GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), int GPU_FUNC_ARGUMENT(nAtoms), - const real* GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM; + const real* GPU_FUNC_ARGUMENT(chargesA), + const real* GPU_FUNC_ARGUMENT(chargesB) = nullptr) GPU_FUNC_TERM; /*! \brief \libinternal * The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing. @@ -534,11 +566,13 @@ void pme_gpu_reinit_computation(const PmeGpu* pmeGpu); * * \param[in] pme The PME data structure. * \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed. + * \param[in] lambdaQ The Coulomb lambda to use when calculating the results. * \param[out] wcycle The wallclock counter. * \return The output forces, energy and virial */ GPU_FUNC_QUALIFIER PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* GPU_FUNC_ARGUMENT(pme), bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial), + real GPU_FUNC_ARGUMENT(lambdaQ), gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM_WITH_RETURN(PmeOutput{}); diff --git a/src/gromacs/ewald/pme_gpu_program_impl.cu b/src/gromacs/ewald/pme_gpu_program_impl.cu index 84bac0a467..1358793807 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl.cu +++ b/src/gromacs/ewald/pme_gpu_program_impl.cu @@ -55,53 +55,79 @@ constexpr int c_pmeOrder = 4; constexpr bool c_wrapX = true; constexpr bool c_wrapY = true; +constexpr int c_stateA = 0; +constexpr int c_stateB = 1; + //! PME CUDA kernels forward declarations. Kernels are documented in their respective files. -template +template void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. // clang-format off extern template void -pme_spline_and_spread_kernel (const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel (const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel (const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel (const PmeGpuCudaKernelParams); +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); extern template void -pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -// clang-format on +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); +extern 
template void +pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams); -template +template /* It is significantly slower to pass gridIndex as a kernel parameter */ void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. // clang-format off -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel (const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); -extern template void pme_solve_kernel (const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); +extern template void pme_solve_kernel(const PmeGpuCudaKernelParams); // clang-format on -template +template void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams); // Add extern declarations to inform that there will be a definition // provided in another translation unit. // clang-format off -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel(const PmeGpuCudaKernelParams); -extern template void pme_gather_kernel(const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel(const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); +extern template void pme_gather_kernel (const PmeGpuCudaKernelParams); // clang-format on PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) : @@ -118,22 +144,38 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) : * Similarly the spread kernel (without the spline) implies that we should read the spline data from global memory */ // clang-format off - splineAndSpreadKernel = pme_spline_and_spread_kernel; - splineAndSpreadKernelThPerAtom4 = pme_spline_and_spread_kernel; - splineAndSpreadKernelWriteSplines = pme_spline_and_spread_kernel; - splineAndSpreadKernelWriteSplinesThPerAtom4 = pme_spline_and_spread_kernel; - splineKernel = pme_spline_and_spread_kernel; - splineKernelThPerAtom4 = pme_spline_and_spread_kernel; - spreadKernel = pme_spline_and_spread_kernel; - spreadKernelThPerAtom4 = pme_spline_and_spread_kernel; - gatherKernel = pme_gather_kernel; - gatherKernelThPerAtom4 = pme_gather_kernel; - gatherKernelReadSplines = pme_gather_kernel; - gatherKernelReadSplinesThPerAtom4 = pme_gather_kernel; - solveXYZKernel = pme_solve_kernel; - solveXYZEnergyKernel = pme_solve_kernel; - solveYZXKernel = pme_solve_kernel; - solveYZXEnergyKernel = pme_solve_kernel; + splineAndSpreadKernelSingle = pme_spline_and_spread_kernel; + splineAndSpreadKernelThPerAtom4Single 
= pme_spline_and_spread_kernel; + splineAndSpreadKernelWriteSplinesSingle = pme_spline_and_spread_kernel; + splineAndSpreadKernelWriteSplinesThPerAtom4Single = pme_spline_and_spread_kernel; + splineKernelSingle = pme_spline_and_spread_kernel; + splineKernelThPerAtom4Single = pme_spline_and_spread_kernel; + spreadKernelSingle = pme_spline_and_spread_kernel; + spreadKernelThPerAtom4Single = pme_spline_and_spread_kernel; + splineAndSpreadKernelDual = pme_spline_and_spread_kernel; + splineAndSpreadKernelThPerAtom4Dual = pme_spline_and_spread_kernel; + splineAndSpreadKernelWriteSplinesDual = pme_spline_and_spread_kernel; + splineAndSpreadKernelWriteSplinesThPerAtom4Dual = pme_spline_and_spread_kernel; + splineKernelDual = pme_spline_and_spread_kernel; + splineKernelThPerAtom4Dual = pme_spline_and_spread_kernel; + spreadKernelDual = pme_spline_and_spread_kernel; + spreadKernelThPerAtom4Dual = pme_spline_and_spread_kernel; + gatherKernelSingle = pme_gather_kernel; + gatherKernelThPerAtom4Single = pme_gather_kernel; + gatherKernelReadSplinesSingle = pme_gather_kernel; + gatherKernelReadSplinesThPerAtom4Single = pme_gather_kernel; + gatherKernelDual = pme_gather_kernel; + gatherKernelThPerAtom4Dual = pme_gather_kernel; + gatherKernelReadSplinesDual = pme_gather_kernel; + gatherKernelReadSplinesThPerAtom4Dual = pme_gather_kernel; + solveXYZKernelA = pme_solve_kernel; + solveXYZEnergyKernelA = pme_solve_kernel; + solveYZXKernelA = pme_solve_kernel; + solveYZXEnergyKernelA = pme_solve_kernel; + solveXYZKernelB = pme_solve_kernel; + solveXYZEnergyKernelB = pme_solve_kernel; + solveYZXKernelB = pme_solve_kernel; + solveYZXEnergyKernelB = pme_solve_kernel; // clang-format on } diff --git a/src/gromacs/ewald/pme_gpu_program_impl.h b/src/gromacs/ewald/pme_gpu_program_impl.h index 0a7cc1a5c5..75d3f881d0 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl.h +++ b/src/gromacs/ewald/pme_gpu_program_impl.h @@ -105,17 +105,28 @@ struct PmeGpuProgramImpl * or recalculated in the gather. * Spreading kernels also have hardcoded X/Y indices wrapping parameters, * as a placeholder for implementing 1/2D decomposition. + * The kernels are templated separately for spreading on one grid (one or + * two sets of coefficients) or on two grids (required for energy and virial + * calculations). 
*/ size_t spreadWorkGroupSize; - PmeKernelHandle splineKernel; - PmeKernelHandle splineKernelThPerAtom4; - PmeKernelHandle spreadKernel; - PmeKernelHandle spreadKernelThPerAtom4; - PmeKernelHandle splineAndSpreadKernel; - PmeKernelHandle splineAndSpreadKernelThPerAtom4; - PmeKernelHandle splineAndSpreadKernelWriteSplines; - PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4; + PmeKernelHandle splineKernelSingle; + PmeKernelHandle splineKernelThPerAtom4Single; + PmeKernelHandle spreadKernelSingle; + PmeKernelHandle spreadKernelThPerAtom4Single; + PmeKernelHandle splineAndSpreadKernelSingle; + PmeKernelHandle splineAndSpreadKernelThPerAtom4Single; + PmeKernelHandle splineAndSpreadKernelWriteSplinesSingle; + PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4Single; + PmeKernelHandle splineKernelDual; + PmeKernelHandle splineKernelThPerAtom4Dual; + PmeKernelHandle spreadKernelDual; + PmeKernelHandle spreadKernelThPerAtom4Dual; + PmeKernelHandle splineAndSpreadKernelDual; + PmeKernelHandle splineAndSpreadKernelThPerAtom4Dual; + PmeKernelHandle splineAndSpreadKernelWriteSplinesDual; + PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4Dual; //@} //@{ @@ -123,25 +134,36 @@ struct PmeGpuProgramImpl * it can either reduce with previous forces in the host buffer, or ignore them. * Also similarly to the gather we can use either order(4) or order*order (16) threads per atom * and either recalculate the splines or read the ones written by the spread + * The kernels are templated separately for using one or two grids (required for + * calculating energies and virial). */ size_t gatherWorkGroupSize; - PmeKernelHandle gatherKernel; - PmeKernelHandle gatherKernelThPerAtom4; - PmeKernelHandle gatherKernelReadSplines; - PmeKernelHandle gatherKernelReadSplinesThPerAtom4; + PmeKernelHandle gatherKernelSingle; + PmeKernelHandle gatherKernelThPerAtom4Single; + PmeKernelHandle gatherKernelReadSplinesSingle; + PmeKernelHandle gatherKernelReadSplinesThPerAtom4Single; + PmeKernelHandle gatherKernelDual; + PmeKernelHandle gatherKernelThPerAtom4Dual; + PmeKernelHandle gatherKernelReadSplinesDual; + PmeKernelHandle gatherKernelReadSplinesThPerAtom4Dual; //@} //@{ /** Solve kernel doesn't care about the interpolation order, but can optionally * compute energy and virial, and supports XYZ and YZX grid orderings. + * The kernels are templated separately for grids in state A and B. 
*/ size_t solveMaxWorkGroupSize; - PmeKernelHandle solveYZXKernel; - PmeKernelHandle solveXYZKernel; - PmeKernelHandle solveYZXEnergyKernel; - PmeKernelHandle solveXYZEnergyKernel; + PmeKernelHandle solveYZXKernelA; + PmeKernelHandle solveXYZKernelA; + PmeKernelHandle solveYZXEnergyKernelA; + PmeKernelHandle solveXYZEnergyKernelA; + PmeKernelHandle solveYZXKernelB; + PmeKernelHandle solveXYZKernelB; + PmeKernelHandle solveYZXEnergyKernelB; + PmeKernelHandle solveXYZEnergyKernelB; //@} PmeGpuProgramImpl() = delete; diff --git a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp index 6be367b6b3..c88baf40e5 100644 --- a/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp +++ b/src/gromacs/ewald/pme_gpu_program_impl_ocl.cpp @@ -72,14 +72,22 @@ PmeGpuProgramImpl::~PmeGpuProgramImpl() { // TODO: log releasing errors cl_int gmx_used_in_debug stat = 0; - stat |= clReleaseKernel(splineAndSpreadKernel); - stat |= clReleaseKernel(splineKernel); - stat |= clReleaseKernel(spreadKernel); - stat |= clReleaseKernel(gatherKernel); - stat |= clReleaseKernel(solveXYZKernel); - stat |= clReleaseKernel(solveXYZEnergyKernel); - stat |= clReleaseKernel(solveYZXKernel); - stat |= clReleaseKernel(solveYZXEnergyKernel); + stat |= clReleaseKernel(splineAndSpreadKernelSingle); + stat |= clReleaseKernel(splineKernelSingle); + stat |= clReleaseKernel(spreadKernelSingle); + stat |= clReleaseKernel(splineAndSpreadKernelDual); + stat |= clReleaseKernel(splineKernelDual); + stat |= clReleaseKernel(spreadKernelDual); + stat |= clReleaseKernel(gatherKernelSingle); + stat |= clReleaseKernel(gatherKernelDual); + stat |= clReleaseKernel(solveXYZKernelA); + stat |= clReleaseKernel(solveXYZEnergyKernelA); + stat |= clReleaseKernel(solveYZXKernelA); + stat |= clReleaseKernel(solveYZXEnergyKernelA); + stat |= clReleaseKernel(solveXYZKernelB); + stat |= clReleaseKernel(solveXYZEnergyKernelB); + stat |= clReleaseKernel(solveYZXKernelB); + stat |= clReleaseKernel(solveYZXEnergyKernelB); GMX_ASSERT(stat == CL_SUCCESS, gmx::formatString("Failed to release PME OpenCL resources %d: %s", stat, ocl_get_error_string(stat).c_str()) @@ -163,7 +171,7 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo) } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR; - constexpr cl_uint expectedKernelCount = 9; + constexpr cl_uint expectedKernelCount = 18; // Has to be equal or larger than the number of kernel instances. // If it is not, CL_INVALID_VALUE will be thrown. std::vector kernels(expectedKernelCount, nullptr); @@ -193,42 +201,79 @@ void PmeGpuProgramImpl::compileKernels(const DeviceInformation& deviceInfo) // The names below must correspond to those defined in pme_program.cl // TODO use a map with string key instead? 
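One way to act on the TODO above would be a name-to-member lookup table instead of the growing strcmp chain. The sketch below is illustrative only and not part of this change: KernelHandle, PmeKernels, makeKernelTable and assignKernel are hypothetical stand-ins for cl_kernel and the PmeGpuProgramImpl members, and only a few of the kernel names defined in pme_program.cl are listed.

#include <string>
#include <unordered_map>
#include <vector>

using KernelHandle = void*; // stand-in for cl_kernel in this sketch

struct PmeKernels
{
    KernelHandle splineAndSpreadKernelSingle = nullptr;
    KernelHandle splineAndSpreadKernelDual   = nullptr;
    KernelHandle solveYZXKernelA             = nullptr;
    KernelHandle solveYZXKernelB             = nullptr;
    // ... remaining handles omitted for brevity
};

// Map each kernel name (as defined in pme_program.cl) to the member(s) it populates.
static std::unordered_map<std::string, std::vector<KernelHandle PmeKernels::*>> makeKernelTable()
{
    return {
        { "pmeSplineAndSpreadKernelSingle", { &PmeKernels::splineAndSpreadKernelSingle } },
        { "pmeSplineAndSpreadKernelDual", { &PmeKernels::splineAndSpreadKernelDual } },
        { "pmeSolveYZXKernelA", { &PmeKernels::solveYZXKernelA } },
        { "pmeSolveYZXKernelB", { &PmeKernels::solveYZXKernelB } },
    };
}

// Assign a freshly built kernel object to every member registered under its name.
static void assignKernel(PmeKernels* kernels, const std::string& name, KernelHandle kernel)
{
    static const auto table = makeKernelTable();
    const auto        found = table.find(name);
    if (found != table.end())
    {
        for (auto member : found->second)
        {
            kernels->*member = kernel;
        }
    }
}

With a table like this, adding further variants (such as the A/B solve kernels or the Single/Dual spread kernels introduced here) would only require new table entries rather than new branches.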
- if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernel")) + if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernelSingle")) { - splineKernel = kernel; + splineKernelSingle = kernel; } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernelSingle")) { - splineAndSpreadKernel = kernel; - splineAndSpreadKernelWriteSplines = kernel; - checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo); + splineAndSpreadKernelSingle = kernel; + splineAndSpreadKernelWriteSplinesSingle = kernel; + checkRequiredWarpSize(splineAndSpreadKernelSingle, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernelSingle")) { - spreadKernel = kernel; - checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo); + spreadKernelSingle = kernel; + checkRequiredWarpSize(spreadKernelSingle, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernelDual")) { - gatherKernel = kernel; - gatherKernelReadSplines = kernel; - checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo); + splineKernelDual = kernel; } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernelDual")) { - solveYZXKernel = kernel; + splineAndSpreadKernelDual = kernel; + splineAndSpreadKernelWriteSplinesDual = kernel; + checkRequiredWarpSize(splineAndSpreadKernelDual, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXEnergyKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernelDual")) { - solveYZXEnergyKernel = kernel; + spreadKernelDual = kernel; + checkRequiredWarpSize(spreadKernelDual, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernelSingle")) { - solveXYZKernel = kernel; + gatherKernelSingle = kernel; + gatherKernelReadSplinesSingle = kernel; + checkRequiredWarpSize(gatherKernelSingle, kernelNamesBuffer.data(), deviceInfo); } - else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZEnergyKernel")) + else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernelDual")) { - solveXYZEnergyKernel = kernel; + gatherKernelDual = kernel; + gatherKernelReadSplinesDual = kernel; + checkRequiredWarpSize(gatherKernelDual, kernelNamesBuffer.data(), deviceInfo); + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernelA")) + { + solveYZXKernelA = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXEnergyKernelA")) + { + solveYZXEnergyKernelA = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZKernelA")) + { + solveXYZKernelA = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZEnergyKernelA")) + { + solveXYZEnergyKernelA = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernelB")) + { + solveYZXKernelB = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXEnergyKernelB")) + { + solveYZXEnergyKernelB = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZKernelB")) + { + solveXYZKernelB = kernel; + } + else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZEnergyKernelB")) + { + solveXYZEnergyKernelB = kernel; } } 
clReleaseProgram(program); diff --git a/src/gromacs/ewald/pme_gpu_staging.h b/src/gromacs/ewald/pme_gpu_staging.h index 4a95c0cda8..7439027639 100644 --- a/src/gromacs/ewald/pme_gpu_staging.h +++ b/src/gromacs/ewald/pme_gpu_staging.h @@ -54,6 +54,11 @@ #include "gromacs/gpu_utils/hostallocator.h" #include "gromacs/math/vectypes.h" +#ifndef NUM_STATES +//! Number of FEP states. +# define NUM_STATES 2 +#endif + /*! \internal \brief * The PME GPU intermediate buffers structure, included in the main PME GPU structure by value. * Buffers are managed by the PME GPU module. @@ -64,9 +69,9 @@ struct PmeGpuStaging gmx::PaddedHostVector h_forces; /*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */ - float* h_virialAndEnergy; + float* h_virialAndEnergy[NUM_STATES]; /*! \brief B-spline values intermediate host-side buffer. */ - float* h_splineModuli; + float* h_splineModuli[NUM_STATES]; /*! \brief Pointer to the host memory with B-spline values. Only used for host-side gather, or unit tests */ float* h_theta; diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h index 97f8416499..4274f095d9 100644 --- a/src/gromacs/ewald/pme_gpu_types.h +++ b/src/gromacs/ewald/pme_gpu_types.h @@ -82,6 +82,11 @@ static_assert(sizeof(DeviceBuffer) == 8, # define HIDE_FROM_OPENCL_COMPILER(x) char8 #endif +#ifndef NUMFEPSTATES +//! Number of FEP states. +# define NUMFEPSTATES 2 +#endif + /* What follows is all the PME GPU function arguments, * sorted into several device-side structures depending on the update rate. * This is GPU agnostic (float3 replaced by float[3], etc.). @@ -99,7 +104,7 @@ struct PmeGpuConstParams float elFactor; /*! \brief Virial and energy GPU array. Size is c_virialAndEnergyCount (7) floats. * The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_virialAndEnergy; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_virialAndEnergy[NUMFEPSTATES]; }; /*! \internal \brief @@ -130,14 +135,14 @@ struct PmeGpuGridParams /* Grid arrays */ /*! \brief Real space grid. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_realGrid; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_realGrid[NUMFEPSTATES]; /*! \brief Complex grid - used in FFT/solve. If inplace cu/clFFT is used, then it is the same handle as realGrid. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_fourierGrid; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_fourierGrid[NUMFEPSTATES]; /*! \brief Grid spline values as in pme->bsp_mod * (laid out sequentially (XXX....XYYY......YZZZ.....Z)) */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_splineModuli; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_splineModuli[NUMFEPSTATES]; /*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_fractShiftsTable; /*! \brief Gridline indices lookup table @@ -158,10 +163,10 @@ struct PmeGpuAtomParams * but reallocation happens only at DD. */ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_coordinates; - /*! \brief Global GPU memory array handle with input atom charges. + /*! \brief Global GPU memory array handle with input atom charges in states A and B. * The charges only need to be reallocated and copied to the GPU at DD step. */ - HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_coefficients; + HIDE_FROM_OPENCL_COMPILER(DeviceBuffer) d_coefficients[NUMFEPSTATES]; /*! \brief Global GPU memory array handle with input/output rvec atom forces. 
* The forces change and need to be copied from (and possibly to) the GPU for every PME * computation, but reallocation happens only at DD. @@ -196,6 +201,9 @@ struct PmeGpuDynamicParams float recipBox[DIM][DIM]; /*! \brief The unit cell volume for solving. */ float boxVolume; + + /*! \brief The current coefficient scaling value. */ + float scale; }; /*! \internal \brief diff --git a/src/gromacs/ewald/pme_gpu_types_host.h b/src/gromacs/ewald/pme_gpu_types_host.h index 9d7e2f78f9..f8e42f7fb6 100644 --- a/src/gromacs/ewald/pme_gpu_types_host.h +++ b/src/gromacs/ewald/pme_gpu_types_host.h @@ -100,7 +100,7 @@ struct DeviceInformation; */ struct PmeShared { - /*! \brief Grid count - currently always 1 on GPU */ + /*! \brief Grid count */ int ngrids; /*! \brief Grid dimensions - nkx, nky, nkz */ int nk[DIM]; diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h index e134d2c0a7..f3deae2842 100644 --- a/src/gromacs/ewald/pme_gpu_types_host_impl.h +++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h @@ -62,6 +62,11 @@ #include "pme_gpu_3dfft.h" +#ifndef NUMFEPSTATES +//! Number of FEP states. +# define NUMFEPSTATES 2 +#endif + class GpuParallel3dFft; /*! \internal \brief @@ -141,21 +146,21 @@ struct PmeGpuSpecific /*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */ int splineDataSizeAlloc = 0; /*! \brief The kernelParams.atoms.coefficients float element count (actual) */ - int coefficientsSize = 0; + int coefficientsSize[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.atoms.coefficients float element count (reserved) */ - int coefficientsSizeAlloc = 0; + int coefficientsCapacity[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */ - int splineValuesSize = 0; + int splineValuesSize[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */ - int splineValuesSizeAlloc = 0; + int splineValuesCapacity[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.realGrid float element count (actual) */ - int realGridSize = 0; + int realGridSize[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.realGrid float element count (reserved) */ - int realGridSizeAlloc = 0; + int realGridCapacity[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */ - int complexGridSize = 0; + int complexGridSize[NUMFEPSTATES] = { 0, 0 }; /*! \brief The kernelParams.grid.fourierGrid float (not float2!) 
element count (reserved) */ - int complexGridSizeAlloc = 0; + int complexGridCapacity[NUMFEPSTATES] = { 0, 0 }; }; #endif diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index a6e4f23183..1a33cc8ed0 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -418,7 +418,7 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, { if (atomSetChanged) { - gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA.data()); + gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA.data(), pme_pp->chargeB.data()); if (useGpuForPme) { stateGpu->reinit(nat, nat); @@ -707,7 +707,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, stepWork.computeVirial = computeEnergyAndVirial; stepWork.computeEnergy = computeEnergyAndVirial; stepWork.computeForces = true; - PmeOutput output; + PmeOutput output = { {}, false, 0, { { 0 } }, 0, 0, { { 0 } }, 0 }; if (useGpuForPme) { stepWork.haveDynamicBox = false; @@ -723,10 +723,10 @@ int gmx_pmeonly(struct gmx_pme_t* pme, // TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157 auto xReadyOnDevice = nullptr; - pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle); + pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle, lambda_q); pme_gpu_launch_complex_transforms(pme, wcycle, stepWork); - pme_gpu_launch_gather(pme, wcycle); - output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, wcycle); + pme_gpu_launch_gather(pme, wcycle, lambda_q); + output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle); pme_gpu_reinit_computation(pme, wcycle); } else diff --git a/src/gromacs/ewald/pme_output.h b/src/gromacs/ewald/pme_output.h index 6567341a1e..bc8a501fce 100644 --- a/src/gromacs/ewald/pme_output.h +++ b/src/gromacs/ewald/pme_output.h @@ -61,10 +61,14 @@ struct PmeOutput real coulombEnergy_ = 0; //!< Host staging area for PME coulomb virial contributions matrix coulombVirial_ = { { 0 } }; + //!< Host staging area for PME coulomb dVdl. + real coulombDvdl_ = 0; //!< Host staging area for PME LJ energy real lennardJonesEnergy_ = 0; //!< Host staging area for PME LJ virial contributions matrix lennardJonesVirial_ = { { 0 } }; + //!< Host staging area for PME LJ dVdl. (Not used) + real lennardJonesDvdl_ = 0; }; #endif diff --git a/src/gromacs/ewald/pme_program.cl b/src/gromacs/ewald/pme_program.cl index 6b6aab6d2d..354aa12e58 100644 --- a/src/gromacs/ewald/pme_program.cl +++ b/src/gromacs/ewald/pme_program.cl @@ -59,31 +59,70 @@ #define atomsPerBlock (c_spreadWorkGroupSize / threadsPerAtom) #define atomsPerWarp (warp_size / threadsPerAtom) -// spline/spread fused +// spline/spread fused single grid (without FEP or FEP without energy and virial calculation at this step) #define computeSplines 1 #define spreadCharges 1 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineAndSpreadKernel +#define numGrids 1 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineAndSpreadKernelSingle #include "pme_spread.clh" #undef computeSplines #undef spreadCharges +#undef numGrids #undef CUSTOMIZED_KERNEL_NAME -// spline +// spline on single grid (without FEP or FEP without energy and virial calculation at this step) #define computeSplines 1 #define spreadCharges 0 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineKernel +#define numGrids 1 /* Using define to avoid a conditional inside the function. 
*/ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineKernelSingle #include "pme_spread.clh" #undef computeSplines #undef spreadCharges +#undef numGrids #undef CUSTOMIZED_KERNEL_NAME -// spread +// spread on single grid (without FEP or FEP without energy and virial calculation at this step) #define computeSplines 0 #define spreadCharges 1 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSpreadKernel +#define numGrids 1 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSpreadKernelSingle #include "pme_spread.clh" #undef computeSplines #undef spreadCharges +#undef numGrids +#undef CUSTOMIZED_KERNEL_NAME + +// spline/spread fused on two grids (FEP with energy and virial calculation at this step) +#define computeSplines 1 +#define spreadCharges 1 +#define numGrids 2 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineAndSpreadKernelDual +#include "pme_spread.clh" +#undef computeSplines +#undef spreadCharges +#undef numGrids +#undef CUSTOMIZED_KERNEL_NAME + +// spline on two grids (FEP with energy and virial calculation at this step) +#define computeSplines 1 +#define spreadCharges 0 +#define numGrids 2 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineKernelDual +#include "pme_spread.clh" +#undef computeSplines +#undef spreadCharges +#undef numGrids +#undef CUSTOMIZED_KERNEL_NAME + +// spread on two grids (FEP with energy and virial calculation at this step) +#define computeSplines 0 +#define spreadCharges 1 +#define numGrids 2 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeSpreadKernelDual +#include "pme_spread.clh" +#undef computeSplines +#undef spreadCharges +#undef numGrids #undef CUSTOMIZED_KERNEL_NAME @@ -94,11 +133,19 @@ #define atomsPerBlock (c_gatherWorkGroupSize / threadsPerAtom) -// gather -#define CUSTOMIZED_KERNEL_NAME(x) pmeGatherKernel +// gather using single grid +#define numGrids 1 /* Using define to avoid a conditional inside the function. */ +#define CUSTOMIZED_KERNEL_NAME(x) pmeGatherKernelSingle #include "pme_gather.clh" +#undef numGrids #undef CUSTOMIZED_KERNEL_NAME +// gather using two grids +#define numGrids 2 /* Using define to avoid a conditional inside the function. 
*/ +#define CUSTOMIZED_KERNEL_NAME(x) pmeGatherKernelDual +#include "pme_gather.clh" +#undef numGrids +#undef CUSTOMIZED_KERNEL_NAME #undef atomsPerBlock @@ -108,37 +155,73 @@ #define YZX 1 #define XYZ 2 -// solve, YZX dimension order +// solve, YZX dimension order state A +#define gridOrdering YZX +#define computeEnergyAndVirial 0 +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXKernelA +#include "pme_solve.clh" +#undef gridOrdering +#undef computeEnergyAndVirial +#undef CUSTOMIZED_KERNEL_NAME + +// solve with reduction, YZX dimension order state A +#define gridOrdering YZX +#define computeEnergyAndVirial 1 +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXEnergyKernelA +#include "pme_solve.clh" +#undef gridOrdering +#undef computeEnergyAndVirial +#undef CUSTOMIZED_KERNEL_NAME + +// solve, XYZ dimension order state A +#define gridOrdering XYZ +#define computeEnergyAndVirial 0 +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZKernelA +#include "pme_solve.clh" +#undef gridOrdering +#undef computeEnergyAndVirial +#undef CUSTOMIZED_KERNEL_NAME + +// solve with reduction, XYZ dimension order state A +#define gridOrdering XYZ +#define computeEnergyAndVirial 1 +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZEnergyKernelA +#include "pme_solve.clh" +#undef gridOrdering +#undef computeEnergyAndVirial +#undef CUSTOMIZED_KERNEL_NAME + +// solve, YZX dimension order state B #define gridOrdering YZX #define computeEnergyAndVirial 0 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXKernel +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXKernelB #include "pme_solve.clh" #undef gridOrdering #undef computeEnergyAndVirial #undef CUSTOMIZED_KERNEL_NAME -// solve with reduction, YZX dimension order +// solve with reduction, YZX dimension order state B #define gridOrdering YZX #define computeEnergyAndVirial 1 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXEnergyKernel +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXEnergyKernelB #include "pme_solve.clh" #undef gridOrdering #undef computeEnergyAndVirial #undef CUSTOMIZED_KERNEL_NAME -// solve, XYZ dimension order +// solve, XYZ dimension order state B #define gridOrdering XYZ #define computeEnergyAndVirial 0 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZKernel +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZKernelB #include "pme_solve.clh" #undef gridOrdering #undef computeEnergyAndVirial #undef CUSTOMIZED_KERNEL_NAME -// solve with reduction, XYZ dimension order +// solve with reduction, XYZ dimension order state B #define gridOrdering XYZ #define computeEnergyAndVirial 1 -#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZEnergyKernel +#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZEnergyKernelB #include "pme_solve.clh" #undef gridOrdering #undef computeEnergyAndVirial diff --git a/src/gromacs/ewald/pme_solve.clh b/src/gromacs/ewald/pme_solve.clh index 2c4dbde609..0c21512b6f 100644 --- a/src/gromacs/ewald/pme_solve.clh +++ b/src/gromacs/ewald/pme_solve.clh @@ -60,7 +60,8 @@ * \param[in] kernelParams Input PME GPU data in constant memory. * \param[in] gm_splineModuli B-Spline moduli. * \param[out] gm_virialAndEnergy Reduced virial and enrgy (only with computeEnergyAndVirial == - * true) \param[in,out] gm_grid Fourier grid to transform. + * true) + * \param[in,out] gm_grid Fourier grid to transform. 
*/ __attribute__((work_group_size_hint(c_solveMaxWorkGroupSize, 1, 1))) __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKernelParams kernelParams, @@ -137,8 +138,9 @@ __kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKer & (gridLineIndex < gridLinesPerBlock)) { /* The offset should be equal to the global thread index for coalesced access */ - const int gridIndex = (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor; - __global float2* __restrict__ gm_gridCell = gm_grid + gridIndex; + const int gridThreadIndex = + (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor; + __global float2* __restrict__ gm_gridCell = gm_grid + gridThreadIndex; const int kMajor = indexMajor + localOffsetMajor; /* Checking either X in XYZ, or Y in YZX cases */ diff --git a/src/gromacs/ewald/pme_solve.cu b/src/gromacs/ewald/pme_solve.cu index 784de81f96..3f5d2d06f4 100644 --- a/src/gromacs/ewald/pme_solve.cu +++ b/src/gromacs/ewald/pme_solve.cu @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -53,10 +53,11 @@ * PME complex grid solver kernel function. * * \tparam[in] gridOrdering Specifies the dimension ordering of the complex grid. - * \tparam[in] computeEnergyAndVirial Tells if the reciprocal energy and virial should be - * computed. \param[in] kernelParams Input PME CUDA data in constant memory. + * \tparam[in] computeEnergyAndVirial Tells if the reciprocal energy and virial should be computed. + * \tparam[in] gridIndex The index of the grid to use in the kernel. + * \param[in] kernelParams Input PME CUDA data in constant memory. 
*/ -template +template __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE __global__ void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParams) { @@ -80,14 +81,14 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT } /* Global memory pointers */ - const float* __restrict__ gm_splineValueMajor = - kernelParams.grid.d_splineModuli + kernelParams.grid.splineValuesOffset[majorDim]; - const float* __restrict__ gm_splineValueMiddle = - kernelParams.grid.d_splineModuli + kernelParams.grid.splineValuesOffset[middleDim]; - const float* __restrict__ gm_splineValueMinor = - kernelParams.grid.d_splineModuli + kernelParams.grid.splineValuesOffset[minorDim]; - float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy; - float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid; + const float* __restrict__ gm_splineValueMajor = kernelParams.grid.d_splineModuli[gridIndex] + + kernelParams.grid.splineValuesOffset[majorDim]; + const float* __restrict__ gm_splineValueMiddle = kernelParams.grid.d_splineModuli[gridIndex] + + kernelParams.grid.splineValuesOffset[middleDim]; + const float* __restrict__ gm_splineValueMinor = kernelParams.grid.d_splineModuli[gridIndex] + + kernelParams.grid.splineValuesOffset[minorDim]; + float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy[gridIndex]; + float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid[gridIndex]; /* Various grid sizes and indices */ const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused @@ -131,8 +132,9 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT & (gridLineIndex < gridLinesPerBlock)) { /* The offset should be equal to the global thread index for coalesced access */ - const int gridIndex = (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor; - float2* __restrict__ gm_gridCell = gm_grid + gridIndex; + const int gridThreadIndex = + (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor; + float2* __restrict__ gm_gridCell = gm_grid + gridThreadIndex; const int kMajor = indexMajor + localOffsetMajor; /* Checking either X in XYZ, or Y in YZX cases */ @@ -352,7 +354,11 @@ __launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUT } //! 
Kernel instantiations -template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); -template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); -template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); -template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); +template __global__ void pme_solve_kernel(const PmeGpuCudaKernelParams); diff --git a/src/gromacs/ewald/pme_spread.clh b/src/gromacs/ewald/pme_spread.clh index 91e36c2518..3342120d51 100644 --- a/src/gromacs/ewald/pme_spread.clh +++ b/src/gromacs/ewald/pme_spread.clh @@ -75,7 +75,8 @@ * * \param[out] sm_destination Local memory array for output. * \param[in] gm_source Global memory array for input. - * \param[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array). + * \param[in] dataCountPerAtom Number of data elements per single + * atom (e.g. DIM for an rvec coordinates array). * */ inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination, @@ -113,7 +114,7 @@ inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination, * \param[out] gm_dtheta Atom spline parameter derivatives * \param[out] gm_gridlineIndices Same as \p sm_gridlineIndices but in global memory. * \param[in] gm_fractShiftsTable Atom fractional coordinates correction table - * \param[in] gm_gridlineIndicesTable Atom fractional coordinates correction table + * \param[in] gm_gridlineIndicesTable Table with atom gridline indices */ gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams, const int atomIndexOffset, @@ -404,10 +405,17 @@ gmx_opencl_inline void spread_charges(const struct PmeOpenCLKernelParams kernelP * \param[in,out] gm_theta Atom spline parameter values * \param[out] gm_dtheta Atom spline parameter derivatives * \param[in,out] gm_gridlineIndices Atom gridline indices (ivec) - * \param[out] gm_grid Global 3D grid for charge spreading. + * \param[out] gm_gridA Global 3D grid for charge spreading. + * Grid for the unperturbed state, FEP state A or the single grid used for + * interpolated coefficients on one grid in FEP A/B. + * \param[out] gm_gridB Global 3D grid for charge spreading. + * FEP state B when using dual grids (when calculating energy and virials). * \param[in] gm_fractShiftsTable Atom fractional coordinates correction table - * \param[in] gm_gridlineIndicesTable Atom fractional coordinates correction table - * \param[in] gm_coefficients Atom charges/coefficients. + * \param[in] gm_gridlineIndicesTable Table with atom gridline indices + * \param[in] gm_coefficientsA Atom charges/coefficients of the unperturbed state + * or FEP state A. + * \param[in] gm_coefficientsB Atom charges/coefficients in FEP state B. Only + * used when spreading interpolated coefficients on one grid. 
* \param[in] gm_coordinates Atom coordinates (rvec) */ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void CUSTOMIZED_KERNEL_NAME( @@ -415,10 +423,12 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void __global float* __restrict__ gm_theta, __global float* __restrict__ gm_dtheta, __global int* __restrict__ gm_gridlineIndices, - __global float* __restrict__ gm_grid, + __global float* __restrict__ gm_gridA, + __global float* __restrict__ gm_gridB, __global const float* __restrict__ gm_fractShiftsTable, __global const int* __restrict__ gm_gridlineIndicesTable, - __global const float* __restrict__ gm_coefficients, + __global const float* __restrict__ gm_coefficientsA, + __global const float* __restrict__ gm_coefficientsB, __global const float* __restrict__ gm_coordinates) { // Gridline indices, ivec @@ -433,9 +443,10 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void __local float sm_coordinates[DIM * atomsPerBlock]; const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock; + int gridIndex = 0; /* Staging coefficients/charges for both spline and spread */ - pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients, 1); + pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsA, 1); if (computeSplines) { @@ -470,6 +481,17 @@ __attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void /* Spreading */ if (spreadCharges) { - spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_grid); + spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_gridA); + } + if (numGrids == 2) + { + barrier(CLK_LOCAL_MEM_FENCE); + gridIndex = 1; + pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsB, 1); + barrier(CLK_LOCAL_MEM_FENCE); + if (spreadCharges) + { + spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_gridB); + } } } diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu index 806dddd4f8..3cf3a1f7ca 100644 --- a/src/gromacs/ewald/pme_spread.cu +++ b/src/gromacs/ewald/pme_spread.cu @@ -53,6 +53,17 @@ #include "pme_gpu_calculate_splines.cuh" #include "pme_grid.h" +/* + * This define affects the spline calculation behaviour in the kernel. + * 0: a single GPU thread handles a single dimension of a single particle (calculating and storing + * (order) spline values and derivatives). 1: (order) threads do redundant work on this same task, + * each one stores only a single theta and single dtheta into global arrays. The only efficiency + * difference is less global store operations, countered by more redundant spline computation. + * + * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so). + */ +#define PME_GPU_PARALLEL_SPLINE 0 + /*! \brief * Charge spreading onto the grid. * This corresponds to the CPU function spread_coefficients_bsplines_thread(). @@ -61,6 +72,7 @@ * \tparam[in] order PME interpolation order. * \tparam[in] wrapX Whether the grid overlap in dimension X should be wrapped. * \tparam[in] wrapY Whether the grid overlap in dimension Y should be wrapped. + * \tparam[in] gridIndex The index of the grid to use in the kernel. * \tparam[in] threadsPerAtom How many threads work on each atom * * \param[in] kernelParams Input PME CUDA data in constant memory. @@ -68,14 +80,14 @@ * \param[in] sm_gridlineIndices Atom gridline indices in the shared memory. * \param[in] sm_theta Atom spline values in the shared memory. 
*/ -template +template __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams, const float* atomCharge, const int* __restrict__ sm_gridlineIndices, const float* __restrict__ sm_theta) { /* Global memory pointer to the output grid */ - float* __restrict__ gm_grid = kernelParams.grid.d_realGrid; + float* __restrict__ gm_grid = kernelParams.grid.d_realGrid[gridIndex]; // Number of atoms processed by a single warp in spread and gather const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order; @@ -163,11 +175,12 @@ __device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kern * \tparam[in] spreadCharges A boolean which tells if the charge spreading should be performed. * \tparam[in] wrapX A boolean which tells if the grid overlap in dimension X should be wrapped. * \tparam[in] wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped. + * \tparam[in] numGrids The number of grids to use in the kernel. Can be 1 or 2. * \tparam[in] writeGlobal A boolean which tells if the theta values and gridlines should be written to global memory. * \tparam[in] threadsPerAtom How many threads work on each atom * \param[in] kernelParams Input PME CUDA data in constant memory. */ -template +template __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE __global__ void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams) { @@ -177,6 +190,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU const int atomsPerWarp = warp_size / threadsPerAtomValue; // Gridline indices, ivec __shared__ int sm_gridlineIndices[atomsPerBlock * DIM]; + // Charges + __shared__ float sm_coefficients[atomsPerBlock]; // Spline values __shared__ float sm_theta[atomsPerBlock * DIM * order]; float dtheta; @@ -210,14 +225,14 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU /* Charges, required for both spline and spread */ if (c_useAtomDataPrefetch) { - __shared__ float sm_coefficients[atomsPerBlock]; - pme_gpu_stage_atom_data(sm_coefficients, kernelParams.atoms.d_coefficients); + pme_gpu_stage_atom_data(sm_coefficients, + kernelParams.atoms.d_coefficients[0]); __syncthreads(); atomCharge = sm_coefficients[atomIndexLocal]; } else { - atomCharge = kernelParams.atoms.d_coefficients[atomIndexGlobal]; + atomCharge = kernelParams.atoms.d_coefficients[0][atomIndexGlobal]; } if (computeSplines) @@ -259,19 +274,47 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU /* Spreading */ if (spreadCharges) { - spread_charges(kernelParams, &atomCharge, - sm_gridlineIndices, sm_theta); + spread_charges(kernelParams, &atomCharge, + sm_gridlineIndices, sm_theta); + } + if (numGrids == 2) + { + __syncthreads(); + if (c_useAtomDataPrefetch) + { + pme_gpu_stage_atom_data(sm_coefficients, + kernelParams.atoms.d_coefficients[1]); + __syncthreads(); + atomCharge = sm_coefficients[atomIndexLocal]; + } + else + { + atomCharge = kernelParams.atoms.d_coefficients[1][atomIndexGlobal]; + } + if (spreadCharges) + { + spread_charges(kernelParams, &atomCharge, + sm_gridlineIndices, sm_theta); + } } } //! 
Kernel instantiations // clang-format off -template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); -template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); +template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, 
ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams); // clang-format on diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index 888cac5873..2e28fb4aa2 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -81,15 +81,14 @@ namespace test bool pmeSupportsInputForMode(const gmx_hw_info_t& hwinfo, const t_inputrec* inputRec, CodePath mode) { - bool implemented; - gmx_mtop_t mtop; + bool implemented; switch (mode) { case CodePath::CPU: implemented = true; break; case CodePath::GPU: implemented = (pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(hwinfo, nullptr) - && pme_gpu_supports_input(*inputRec, mtop, nullptr)); + && pme_gpu_supports_input(*inputRec, nullptr)); break; default: GMX_THROW(InternalError("Test not implemented for this mode")); @@ -202,7 +201,7 @@ void pmeInitAtoms(gmx_pme_t* pme, atc = &(pme->atc[0]); atc->x = coordinates; atc->coefficient = charges; - gmx_pme_reinit_atoms(pme, atomCount, charges.data()); + gmx_pme_reinit_atoms(pme, atomCount, charges.data(), nullptr); /* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */ break; @@ -211,7 +210,7 @@ void pmeInitAtoms(gmx_pme_t* pme, atc = &(pme->atc[0]); // We need to set atc->n for passing the size in the tests atc->setNumAtoms(atomCount); - gmx_pme_reinit_atoms(pme, atomCount, charges.data()); + gmx_pme_reinit_atoms(pme, atomCount, charges.data(), nullptr); stateGpu->reinit(atomCount, atomCount); stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()), @@ -314,28 +313,35 @@ void pmePerformSplineAndSpread(gmx_pme_t* pme, PmeAtomComm* atc = &(pme->atc[0]); const size_t gridIndex = 0; const bool computeSplinesForZeroCharges = true; - real* fftgrid = spreadCharges ? pme->fftgrid[gridIndex] : nullptr; + real** fftgrid = spreadCharges ? pme->fftgrid : nullptr; real* pmegrid = pme->pmegrid[gridIndex].grid.grid; switch (mode) { case CodePath::CPU: spread_on_grid(pme, atc, &pme->pmegrid[gridIndex], computeSplines, spreadCharges, - fftgrid, computeSplinesForZeroCharges, gridIndex); + fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, + computeSplinesForZeroCharges, gridIndex); if (spreadCharges && !pme->bUseThreads) { wrap_periodic_pmegrid(pme, pmegrid); - copy_pmegrid_to_fftgrid(pme, pmegrid, fftgrid, gridIndex); + copy_pmegrid_to_fftgrid( + pme, pmegrid, fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, gridIndex); } break; +/* The compiler will complain about passing fftgrid (converting double ** to float **) if using + * double precision. GPUs are not used with double precision anyhow. 
*/ +#if !GMX_DOUBLE case CodePath::GPU: { + const real lambdaQ = 1.0; // no synchronization needed as x is transferred in the PME stream GpuEventSynchronizer* xReadyOnDevice = nullptr; - pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges); + pme_gpu_spread(pme->gpu, xReadyOnDevice, fftgrid, computeSplines, spreadCharges, lambdaQ); } break; +#endif default: GMX_THROW(InternalError("Test not implemented for this mode")); } @@ -374,6 +380,7 @@ void pmePerformSolve(const gmx_pme_t* pme, t_complex* h_grid = pmeGetComplexGridInternal(pme); const bool useLorentzBerthelot = false; const size_t threadIndex = 0; + const size_t gridIndex = 0; switch (mode) { case CodePath::CPU: @@ -400,7 +407,7 @@ void pmePerformSolve(const gmx_pme_t* pme, switch (method) { case PmeSolveAlgorithm::Coulomb: - pme_gpu_solve(pme->gpu, h_grid, gridOrdering, computeEnergyAndVirial); + pme_gpu_solve(pme->gpu, gridIndex, h_grid, gridOrdering, computeEnergyAndVirial); break; default: GMX_THROW(InternalError("Test not implemented for this mode")); @@ -421,7 +428,7 @@ void pmePerformGather(gmx_pme_t* pme, CodePath mode, ForcesVector& forces) const size_t threadIndex = 0; const size_t gridIndex = 0; real* pmegrid = pme->pmegrid[gridIndex].grid.grid; - real* fftgrid = pme->fftgrid[gridIndex]; + real** fftgrid = pme->fftgrid; switch (mode) { @@ -432,22 +439,27 @@ void pmePerformGather(gmx_pme_t* pme, CodePath mode, ForcesVector& forces) // something which is normally done in serial spline computation (make_thread_local_ind()) atc->spline[threadIndex].n = atomCount; } - copy_fftgrid_to_pmegrid(pme, fftgrid, pmegrid, gridIndex, pme->nthread, threadIndex); + copy_fftgrid_to_pmegrid(pme, fftgrid[gridIndex], pmegrid, gridIndex, pme->nthread, threadIndex); unwrap_periodic_pmegrid(pme, pmegrid); gather_f_bsplines(pme, pmegrid, true, atc, &atc->spline[threadIndex], scale); break; +/* The compiler will complain about passing fftgrid (converting double ** to float **) if using + * double precision. GPUs are not used with double precision anyhow. */ +#if !GMX_DOUBLE case CodePath::GPU: { // Variable initialization needs a non-switch scope const bool computeEnergyAndVirial = false; - PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial); + const real lambdaQ = 1.0; + PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, lambdaQ); GMX_ASSERT(forces.size() == output.forces_.size(), "Size of force buffers did not match"); - pme_gpu_gather(pme->gpu, reinterpret_cast(fftgrid)); + pme_gpu_gather(pme->gpu, fftgrid, lambdaQ); std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces)); } break; +#endif default: GMX_THROW(InternalError("Test not implemented for this mode")); } @@ -843,7 +855,8 @@ SparseComplexGridValuesOutput pmeGetComplexGrid(const gmx_pme_t* pme, CodePath m //! 
Getting the reciprocal energy and virial PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, PmeSolveAlgorithm method) { - PmeOutput output; + PmeOutput output; + const real lambdaQ = 1.0; switch (mode) { case CodePath::CPU: @@ -863,7 +876,9 @@ PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, P case CodePath::GPU: switch (method) { - case PmeSolveAlgorithm::Coulomb: pme_gpu_getEnergyAndVirial(*pme, &output); break; + case PmeSolveAlgorithm::Coulomb: + pme_gpu_getEnergyAndVirial(*pme, lambdaQ, &output); + break; default: GMX_THROW(InternalError("Test not implemented for this mode")); } diff --git a/src/gromacs/mdlib/mdatoms.cpp b/src/gromacs/mdlib/mdatoms.cpp index f40b2d24af..0d1abd8565 100644 --- a/src/gromacs/mdlib/mdatoms.cpp +++ b/src/gromacs/mdlib/mdatoms.cpp @@ -75,8 +75,9 @@ MDAtoms::~MDAtoms() gmx::AlignedAllocationPolicy::free(mdatoms_->invmass); sfree(mdatoms_->invMassPerDim); sfree(mdatoms_->typeA); - sfree(mdatoms_->chargeB); sfree(mdatoms_->typeB); + /* mdatoms->chargeA and mdatoms->chargeB point at chargeA_.data() + * and chargeB_.data() respectively. They get freed automatically. */ sfree(mdatoms_->sqrt_c6A); sfree(mdatoms_->sigmaA); sfree(mdatoms_->sigma3A); @@ -95,18 +96,30 @@ MDAtoms::~MDAtoms() sfree(mdatoms_->cU2); } -void MDAtoms::resize(int newSize) +void MDAtoms::resizeChargeA(const int newSize) { chargeA_.resizeWithPadding(newSize); mdatoms_->chargeA = chargeA_.data(); } -void MDAtoms::reserve(int newCapacity) +void MDAtoms::resizeChargeB(const int newSize) +{ + chargeB_.resizeWithPadding(newSize); + mdatoms_->chargeB = chargeB_.data(); +} + +void MDAtoms::reserveChargeA(const int newCapacity) { chargeA_.reserveWithPadding(newCapacity); mdatoms_->chargeA = chargeA_.data(); } +void MDAtoms::reserveChargeB(const int newCapacity) +{ + chargeB_.reserveWithPadding(newCapacity); + mdatoms_->chargeB = chargeB_.data(); +} + std::unique_ptr makeMDAtoms(FILE* fp, const gmx_mtop_t& mtop, const t_inputrec& ir, const bool rankHasPmeGpuTask) { auto mdAtoms = std::make_unique(); @@ -114,6 +127,7 @@ std::unique_ptr makeMDAtoms(FILE* fp, const gmx_mtop_t& mtop, const t_i if (rankHasPmeGpuTask) { changePinningPolicy(&mdAtoms->chargeA_, pme_get_pinning_policy()); + changePinningPolicy(&mdAtoms->chargeB_, pme_get_pinning_policy()); } t_mdatoms* md; snew(md, 1); @@ -242,12 +256,16 @@ void atoms2md(const gmx_mtop_t* mtop, // everything, but for now the semantics of md->nalloc being // the capacity are preserved by keeping vectors within // mdAtoms having the same properties as the other arrays. - mdAtoms->reserve(md->nalloc); - mdAtoms->resize(md->nr); + mdAtoms->reserveChargeA(md->nalloc); + mdAtoms->resizeChargeA(md->nr); + if (md->nPerturbed > 0) + { + mdAtoms->reserveChargeB(md->nalloc); + mdAtoms->resizeChargeB(md->nr); + } srenew(md->typeA, md->nalloc); if (md->nPerturbed) { - srenew(md->chargeB, md->nalloc); srenew(md->typeB, md->nalloc); } if (bLJPME) diff --git a/src/gromacs/mdlib/mdatoms.h b/src/gromacs/mdlib/mdatoms.h index c9895b9916..44dbcbba23 100644 --- a/src/gromacs/mdlib/mdatoms.h +++ b/src/gromacs/mdlib/mdatoms.h @@ -69,6 +69,8 @@ class MDAtoms unique_cptr mdatoms_; //! Memory for chargeA that can be set up for efficient GPU transfer. gmx::PaddedHostVector chargeA_; + //! Memory for chargeB that can be set up for efficient GPU transfer. + gmx::PaddedHostVector chargeB_; public: // TODO make this private @@ -78,16 +80,26 @@ public: t_mdatoms* mdatoms() { return mdatoms_.get(); } //! Const getter. 
const t_mdatoms* mdatoms() const { return mdatoms_.get(); } - /*! \brief Resizes memory. + /*! \brief Resizes memory for charges of FEP state A. * * \throws std::bad_alloc If out of memory. */ - void resize(int newSize); - /*! \brief Reserves memory. + void resizeChargeA(int newSize); + /*! \brief Resizes memory for charges of FEP state B. * * \throws std::bad_alloc If out of memory. */ - void reserve(int newCapacity); + void resizeChargeB(int newSize); + /*! \brief Reserves memory for charges of FEP state A. + * + * \throws std::bad_alloc If out of memory. + */ + void reserveChargeA(int newCapacity); + /*! \brief Reserves memory for charges of FEP state B. + * + * \throws std::bad_alloc If out of memory. + */ + void reserveChargeB(int newCapacity); //! Builder function. friend std::unique_ptr makeMDAtoms(FILE* fp, const gmx_mtop_t& mtop, const t_inputrec& ir, bool rankHasPmeGpuTask); diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 5159f2438d..611ca913dc 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -673,17 +673,19 @@ static void computeSpecialForces(FILE* fplog, * \param[in] pmedata The PME structure * \param[in] box The box matrix * \param[in] stepWork Step schedule flags - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in - * the device memory. \param[in] wcycle The wallcycle structure + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory. + * \param[in] lambdaQ The Coulomb lambda of the current state. + * \param[in] wcycle The wallcycle structure */ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata, const matrix box, const StepWorkload& stepWork, GpuEventSynchronizer* xReadyOnDevice, + const real lambdaQ, gmx_wallcycle_t wcycle) { pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork); - pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle); + pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ); } /*! \brief Launch the FFT and gather stages of PME GPU @@ -691,13 +693,17 @@ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata, * This function only implements setting the output forces (no accumulation). * * \param[in] pmedata The PME structure + * \param[in] lambdaQ The Coulomb lambda of the current system state. * \param[in] wcycle The wallcycle structure * \param[in] stepWork Step schedule flags */ -static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle, const gmx::StepWorkload& stepWork) +static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, + const real lambdaQ, + gmx_wallcycle_t wcycle, + const gmx::StepWorkload& stepWork) { pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork); - pme_gpu_launch_gather(pmedata, wcycle); + pme_gpu_launch_gather(pmedata, wcycle, lambdaQ); } /*! \brief @@ -713,6 +719,7 @@ static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle, * \param[in,out] pmedata PME module data * \param[in,out] forceOutputs Output buffer for the forces and virial * \param[in,out] enerd Energy data structure results are reduced into + * \param[in] lambdaQ The Coulomb lambda of the current system state. 
* \param[in] stepWork Step schedule flags * \param[in] wcycle The wallcycle structure */ @@ -720,6 +727,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv, gmx_pme_t* pmedata, gmx::ForceOutputs* forceOutputs, gmx_enerdata_t* enerd, + const real lambdaQ, const StepWorkload& stepWork, gmx_wallcycle_t wcycle) { @@ -739,7 +747,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv, GpuTaskCompletion completionType = (isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check; isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle, &forceWithVirial, - enerd, completionType); + enerd, lambdaQ, completionType); } if (!isNbGpuDone) @@ -1173,7 +1181,7 @@ void do_force(FILE* fplog, if (useGpuPmeOnThisRank) { - launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, wcycle); + launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, lambda[efptCOUL], wcycle); } /* do gridding for pair search */ @@ -1330,7 +1338,7 @@ void do_force(FILE* fplog, // X copy/transform to allow overlap as well as after the GPU NB // launch to avoid FFT launch overhead hijacking the CPU and delaying // the nonbonded kernel. - launchPmeGpuFftAndGather(fr->pmedata, wcycle, stepWork); + launchPmeGpuFftAndGather(fr->pmedata, lambda[efptCOUL], wcycle, stepWork); } /* Communicate coordinates and sum dipole if necessary + @@ -1685,12 +1693,14 @@ void do_force(FILE* fplog, && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps); if (alternateGpuWait) { - alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, wcycle); + alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, lambda[efptCOUL], + stepWork, wcycle); } if (!alternateGpuWait && useGpuPmeOnThisRank) { - pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd); + pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd, + lambda[efptCOUL]); } /* Wait for local GPU NB outputs on the non-alternating wait path */ diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index c9315bfc5c..b90bc9aea5 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -812,7 +812,7 @@ int Mdrunner::mdrunner() hw_opt.nthreads_tmpi); useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi( useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, *hwinfo, - *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); + *inputrec, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks); } GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR @@ -878,7 +878,7 @@ int Mdrunner::mdrunner() nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded, gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected); useGpuForPme = decideWhetherToUseGpusForPme( - useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec, mtop, + useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec, cr->sizeOfDefaultCommunicator, domdecOptions.numPmeRanks, gpusWereDetected); auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr) && inputSupportsGpuBondeds(*inputrec, mtop, nullptr); diff --git a/src/gromacs/mdrun/tpi.cpp b/src/gromacs/mdrun/tpi.cpp index 78305b2d10..3e780cade8 100644 --- a/src/gromacs/mdrun/tpi.cpp +++ b/src/gromacs/mdrun/tpi.cpp @@ -320,7 +320,7 @@ void LegacySimulator::do_tpi() if (EEL_PME(fr->ic->eeltype)) { - gmx_pme_reinit_atoms(fr->pmedata, a_tp0, nullptr); + gmx_pme_reinit_atoms(fr->pmedata, a_tp0, 
nullptr, nullptr); } /* With reacion-field we have distance dependent potentials diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp index 2f9d2baa85..e8a9e697f6 100644 --- a/src/gromacs/taskassignment/decidegpuusage.cpp +++ b/src/gromacs/taskassignment/decidegpuusage.cpp @@ -154,14 +154,12 @@ bool decideWhetherToUseGpusForPmeWithThreadMpi(const bool useGpuFor const std::vector& userGpuTaskAssignment, const gmx_hw_info_t& hardwareInfo, const t_inputrec& inputrec, - const gmx_mtop_t& mtop, const int numRanksPerSimulation, const int numPmeRanksPerSimulation) { // First, exclude all cases where we can't run PME on GPUs. if ((pmeTarget == TaskTarget::Cpu) || !useGpuForNonbonded || !pme_gpu_supports_build(nullptr) - || !pme_gpu_supports_hardware(hardwareInfo, nullptr) - || !pme_gpu_supports_input(inputrec, mtop, nullptr)) + || !pme_gpu_supports_hardware(hardwareInfo, nullptr) || !pme_gpu_supports_input(inputrec, nullptr)) { // PME can't run on a GPU. If the user required that, we issue // an error later. @@ -328,7 +326,6 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, const std::vector& userGpuTaskAssignment, const gmx_hw_info_t& hardwareInfo, const t_inputrec& inputrec, - const gmx_mtop_t& mtop, const int numRanksPerSimulation, const int numPmeRanksPerSimulation, const bool gpusWereDetected) @@ -365,7 +362,7 @@ bool decideWhetherToUseGpusForPme(const bool useGpuForNonbonded, } return false; } - if (!pme_gpu_supports_input(inputrec, mtop, &message)) + if (!pme_gpu_supports_input(inputrec, &message)) { if (pmeTarget == TaskTarget::Gpu) { diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h index bfb002547a..5d15244416 100644 --- a/src/gromacs/taskassignment/decidegpuusage.h +++ b/src/gromacs/taskassignment/decidegpuusage.h @@ -136,7 +136,6 @@ bool decideWhetherToUseGpusForNonbondedWithThreadMpi(TaskTarget non * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. * \param[in] hardwareInfo Hardware information * \param[in] inputrec The user input - * \param[in] mtop Global system topology * \param[in] numRanksPerSimulation The number of ranks in each simulation. * \param[in] numPmeRanksPerSimulation The number of PME ranks in each simulation. * @@ -150,7 +149,6 @@ bool decideWhetherToUseGpusForPmeWithThreadMpi(bool useGpuFor const std::vector& userGpuTaskAssignment, const gmx_hw_info_t& hardwareInfo, const t_inputrec& inputrec, - const gmx_mtop_t& mtop, int numRanksPerSimulation, int numPmeRanksPerSimulation); @@ -209,7 +207,6 @@ bool decideWhetherToUseGpusForNonbonded(TaskTarget nonbondedTarget, * \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs. * \param[in] hardwareInfo Hardware information * \param[in] inputrec The user input - * \param[in] mtop Global system topology * \param[in] numRanksPerSimulation The number of ranks in each simulation. * \param[in] numPmeRanksPerSimulation The number of PME ranks in each simulation. * \param[in] gpusWereDetected Whether compatible GPUs were detected on any node. 
@@ -223,7 +220,6 @@ bool decideWhetherToUseGpusForPme(bool useGpuForNonbonded, const std::vector& userGpuTaskAssignment, const gmx_hw_info_t& hardwareInfo, const t_inputrec& inputrec, - const gmx_mtop_t& mtop, int numRanksPerSimulation, int numPmeRanksPerSimulation, bool gpusWereDetected); diff --git a/src/gromacs/taskassignment/resourcedivision.cpp b/src/gromacs/taskassignment/resourcedivision.cpp index 6d061c94e5..ac44344003 100644 --- a/src/gromacs/taskassignment/resourcedivision.cpp +++ b/src/gromacs/taskassignment/resourcedivision.cpp @@ -360,7 +360,7 @@ int get_nthreads_mpi(const gmx_hw_info_t* hwinfo, GMX_RELEASE_ASSERT((EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)) && pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(*hwinfo, nullptr) - && pme_gpu_supports_input(*inputrec, *mtop, nullptr), + && pme_gpu_supports_input(*inputrec, nullptr), "PME can't be on GPUs unless we are using PME"); // PME on GPUs supports a single PME rank with PP running on the same or few other ranks. -- 2.22.0
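
The patch threads a Coulomb lambda (lambda_q) through the spread and gather launches, stores a coefficient scaling value (scale) in the dynamic kernel parameters, and adds a coulombDvdl_ staging field to PmeOutput. The following is a minimal host-side sketch of how per-grid energies and virials could be combined, under the assumption of linear interpolation in lambda_q between the A and B grids when both are computed; the type and function names are illustrative only and are not the GROMACS API.

#include <array>

struct PerGridOutput /* reduced on the host from d_virialAndEnergy[gridIndex] */
{
    float                energy = 0.0F;
    std::array<float, 6> virial = {}; /* xx, yy, zz, xy, xz, yz */
};

struct CombinedCoulombOutput
{
    float                energy = 0.0F;
    float                dvdl   = 0.0F;
    std::array<float, 6> virial = {};
};

/* Combine FEP state A and B grid contributions, assuming a linear path in lambdaQ. */
static CombinedCoulombOutput combineFepGridOutputs(const PerGridOutput& stateA,
                                                   const PerGridOutput& stateB,
                                                   float                lambdaQ)
{
    CombinedCoulombOutput out;
    const float scaleA = 1.0F - lambdaQ; /* weight of the A grid */
    const float scaleB = lambdaQ;        /* weight of the B grid */
    out.energy         = scaleA * stateA.energy + scaleB * stateB.energy;
    /* For a linear interpolation, dV/dlambda is the difference of the end-state energies. */
    out.dvdl = stateB.energy - stateA.energy;
    for (size_t i = 0; i < out.virial.size(); i++)
    {
        out.virial[i] = scaleA * stateA.virial[i] + scaleB * stateB.virial[i];
    }
    return out;
}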