This is the first patch in a series to enable running Coulomb FEP PME on GPU.
Two sets of coefficients are now used to store the atom charges (FEP states A and B).
Refs #2054, #3117
Change-Id: Iab6eb7ac766800f7c045dc5a00069e77509d391f
changed from quadratic in the number of restraints to linear.
:issue:`3457`
+
+Support for offloading PME to GPU when doing Coulomb FEP
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+PME calculations can be offloaded to GPU when doing Coulomb free-energy perturbations.
- Only single precision is supported.
-- Free energy calculations where charges are perturbed are not supported,
- because only single PME grids can be calculated.
-
- Only dynamical integrators are supported (ie. leap-frog, Velocity Verlet,
stochastic dynamics)
* For PME-only ranks, gmx_pmeonly() has its own call to gmx_pme_reinit_atoms().
*/
const int numPmeAtoms = numHomeAtoms - fr->n_tpi;
- gmx_pme_reinit_atoms(fr->pmedata, numPmeAtoms, mdatoms->chargeA);
+ gmx_pme_reinit_atoms(fr->pmedata, numPmeAtoms, mdatoms->chargeA, mdatoms->chargeB);
}
if (constr)
return addMessageIfNotSupported(errorReasons, error);
}
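Both support checks here follow the same pattern: collect human-readable reasons into a list and hand them to addMessageIfNotSupported for formatting. For orientation, a minimal sketch of what such a helper can look like (the real helper lives in pme.cpp; the exact wording and the use of gmx::joinStrings are assumptions):

    #include <list>
    #include <string>

    #include "gromacs/utility/stringutil.h"

    //! Returns true if there are no reasons; otherwise formats them into \p error (sketch only).
    static bool addMessageIfNotSupported(const std::list<std::string>& errorReasons, std::string* error)
    {
        const bool isSupported = errorReasons.empty();
        if (!isSupported && error != nullptr)
        {
            *error = "PME GPU does not support: " + gmx::joinStrings(errorReasons, "; ") + ".";
        }
        return isSupported;
    }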
-bool pme_gpu_supports_input(const t_inputrec& ir, const gmx_mtop_t& mtop, std::string* error)
+bool pme_gpu_supports_input(const t_inputrec& ir, std::string* error)
{
std::list<std::string> errorReasons;
if (!EEL_PME(ir.coulombtype))
{
errorReasons.emplace_back("interpolation orders other than 4");
}
- if (ir.efep != efepNO)
- {
- if (gmx_mtop_has_perturbed_charges(mtop))
- {
- errorReasons.emplace_back(
- "free energy calculations with perturbed charges (multiple grids)");
- }
- }
if (EVDW_PME(ir.vdwtype))
{
errorReasons.emplace_back("Lennard-Jones PME");
{
errorReasons.emplace_back("interpolation orders other than 4");
}
- if (pme->bFEP)
- {
- errorReasons.emplace_back("free energy calculations (multiple grids)");
- }
if (pme->doLJ)
{
errorReasons.emplace_back("Lennard-Jones PME");
*/
if (!pme_src->gpu && pme_src->nnodes == 1)
{
- gmx_pme_reinit_atoms(*pmedata, pme_src->atc[0].numAtoms(), nullptr);
+ gmx_pme_reinit_atoms(*pmedata, pme_src->atc[0].numAtoms(), nullptr, nullptr);
}
// TODO this is mostly passing around current values
}
delete pme;
}
-void gmx_pme_reinit_atoms(gmx_pme_t* pme, const int numAtoms, const real* charges)
+void gmx_pme_reinit_atoms(gmx_pme_t* pme, const int numAtoms, const real* chargesA, const real* chargesB)
{
if (pme->gpu != nullptr)
{
- pme_gpu_reinit_atoms(pme->gpu, numAtoms, charges);
+ GMX_ASSERT(!(pme->bFEP_q && chargesB == nullptr),
+ "B state charges must be specified if running Coulomb FEP on the GPU");
+ pme_gpu_reinit_atoms(pme->gpu, numAtoms, chargesA, pme->bFEP_q ? chargesB : nullptr);
}
else
{
*
* \param[in,out] pme The PME structure.
* \param[in] numAtoms The number of particles.
- * \param[in] charges The pointer to the array of particle charges.
+ * \param[in] chargesA The pointer to the array of particle charges in the normal state or FEP
+ * state A. Can be nullptr if PME is not performed on the GPU.
+ * \param[in] chargesB The pointer to the array of particle charges in state B. Only used if
+ * charges are perturbed and can otherwise be nullptr.
*/
-void gmx_pme_reinit_atoms(gmx_pme_t* pme, int numAtoms, const real* charges);
+void gmx_pme_reinit_atoms(gmx_pme_t* pme, int numAtoms, const real* chargesA, const real* chargesB);
/* A block of PME GPU functions */
* formed gmx_pme_t structure. Should that one go away/work with inputrec?
*
* \param[in] ir Input system.
- * \param[in] mtop Complete system topology to check if an FE simulation perturbs charges.
* \param[out] error If non-null, the error message if the input is not supported on GPU.
*
* \returns true if PME can run on GPU with this input, false otherwise.
*/
-bool pme_gpu_supports_input(const t_inputrec& ir, const gmx_mtop_t& mtop, std::string* error);
+bool pme_gpu_supports_input(const t_inputrec& ir, std::string* error);
/*! \brief
* Returns the active PME codepath (CPU, GPU, mixed).
* Launches first stage of PME on GPU - spreading kernel.
*
* \param[in] pme The PME data structure.
- * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; nullptr allowed only on separate PME ranks.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates
+ * are ready in the device memory; nullptr allowed only on separate PME ranks.
* \param[in] wcycle The wallclock counter.
+ * \param[in] lambdaQ The Coulomb lambda of the current state of the
+ * system. Only used if FEP of Coulomb is active.
*/
GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
- gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+ gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle),
+ const real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM;
/*! \brief
* Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
/*! \brief
* Launches last stage of PME on GPU - force gathering and D2H force transfer.
*
- * \param[in] pme The PME data structure.
- * \param[in] wcycle The wallclock counter.
+ * \param[in] pme The PME data structure.
+ * \param[in] wcycle The wallclock counter.
+ * \param[in] lambdaQ The Coulomb lambda to use when calculating the results.
*/
GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
- gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
+ gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle),
+ const real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM;
/*! \brief
* Attempts to complete PME GPU tasks.
* \param[in] wcycle The wallclock counter.
* \param[out] forceWithVirial The output force and virial
* \param[out] enerd The output energies
+ * \param[in] lambdaQ The Coulomb lambda to use when calculating the results.
* \param[in] completionKind Indicates whether PME task completion should only be checked rather
* than waited for
* \returns True if the PME GPU tasks have completed
gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle),
gmx::ForceWithVirial* GPU_FUNC_ARGUMENT(forceWithVirial),
gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd),
+ const real GPU_FUNC_ARGUMENT(lambdaQ),
GpuTaskCompletion GPU_FUNC_ARGUMENT(completionKind))
GPU_FUNC_TERM_WITH_RETURN(false);
* \param[in] wcycle The wallclock counter.
* \param[out] forceWithVirial The output force and virial
* \param[out] enerd The output energies
+ * \param[in] lambdaQ The Coulomb lambda to use when calculating the results.
*/
GPU_FUNC_QUALIFIER void pme_gpu_wait_and_reduce(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
const gmx::StepWorkload& GPU_FUNC_ARGUMENT(stepWork),
gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle),
gmx::ForceWithVirial* GPU_FUNC_ARGUMENT(forceWithVirial),
- gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd)) GPU_FUNC_TERM;
+ gmx_enerdata_t* GPU_FUNC_ARGUMENT(enerd),
+ const real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM;
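As a usage illustration, the new lambdaQ argument is simply the current Coulomb lambda threaded through the whole launch/reduce sequence; a hypothetical caller-side sketch (variable names and surrounding bookkeeping are assumptions, not code from this patch):

    // Once per step on a rank that runs PME on the GPU (sketch).
    const real lambdaQ = chargesArePerturbed ? lambda[efptCOUL] : 1.0;
    pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle, lambdaQ);
    pme_gpu_launch_complex_transforms(pme, wcycle, stepWork);
    pme_gpu_launch_gather(pme, wcycle, lambdaQ);
    // ... later, when forces/energies are needed on the CPU:
    pme_gpu_wait_and_reduce(pme, stepWork, wcycle, &forceWithVirial, enerd, lambdaQ);

When charges are not perturbed only one grid exists, and the implementation below falls back to passing 1.0 for the output scaling.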
/*! \brief
* The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
}
}
+/*! \brief Calculate the sum of the force partial components (in X, Y and Z)
+ *
+ * \param[out] fx The force partial component in the X dimension.
+ * \param[out] fy The force partial component in the Y dimension.
+ * \param[out] fz The force partial component in the Z dimension.
+ * \param[in] ixBase The grid line index base value in the X dimension.
+ * \param[in] nx The grid real size in the X dimension.
+ * \param[in] pny The padded grid real size in the Y dimension.
+ * \param[in] pnz The padded grid real size in the Z dimension.
+ * \param[in] constOffset The offset to calculate the global grid index.
+ * \param[in] splineIndexBase The base value of the spline parameter index.
+ * \param[in] tdy The theta and dtheta in the Y dimension.
+ * \param[in] tdz The theta and dtheta in the Z dimension.
+ * \param[in] sm_splineParams Shared memory array of spline parameters.
+ * \param[in] gm_grid Global memory array of the grid to use.
+ */
+inline void sumForceComponents(float* fx,
+ float* fy,
+ float* fz,
+ const int ixBase,
+ const int nx,
+ const int pny,
+ const int pnz,
+ const int constOffset,
+ const int splineIndexBase,
+ const float2 tdy,
+ const float2 tdz,
+ __local const float2* __restrict__ sm_splineParams,
+ __global const float* __restrict__ gm_grid)
+{
+# pragma unroll order
+ for (int ithx = 0; (ithx < order); ithx++)
+ {
+ int ix = ixBase + ithx;
+ if (wrapX & (ix >= nx))
+ {
+ ix -= nx;
+ }
+ const int gridIndexGlobal = ix * pny * pnz + constOffset;
+ assert(gridIndexGlobal >= 0);
+ const float gridValue = gm_grid[gridIndexGlobal];
+ assert(isfinite(gridValue));
+ const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
+ const float2 tdx = sm_splineParams[splineIndexX];
+ const float fxy1 = tdz.x * gridValue;
+ const float fz1 = tdz.y * gridValue;
+ *fx += tdx.y * tdy.x * fxy1;
+ *fy += tdx.x * tdy.y * fxy1;
+ *fz += tdx.x * tdy.x * fz1;
+ }
+}
+
+/*! \brief Calculate the grid forces and store them in shared memory.
+ *
+ * \param[in,out] sm_forces Shared memory array with the output forces.
+ * \param[in] forceIndexLocal The local (per thread) index in the sm_forces array.
+ * \param[in] forceIndexGlobal The index of the thread in the gm_coefficients array.
+ * \param[in] recipBox The reciprocal box.
+ * \param[in] scale The scale to use when calculating the forces (only relevant when
+ * using two grids).
+ * \param[in] gm_coefficients Global memory array of the coefficients to use.
+ */
+inline void calculateAndStoreGridForces(__local float* __restrict__ sm_forces,
+ const int forceIndexLocal,
+ const int forceIndexGlobal,
+ const float recipBox[DIM][DIM],
+ const float scale,
+ __global const float* __restrict__ gm_coefficients)
+{
+ const float3 atomForces = vload3(forceIndexLocal, sm_forces);
+ float negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
+ float3 result;
+ result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
+ result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
+ result.z = negCoefficient
+ * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
+ + recipBox[ZZ][ZZ] * atomForces.z);
+ vstore3(result, forceIndexLocal, sm_forces);
+}
+
#endif // COMPILE_GATHER_HELPERS_ONCE
/*! \brief
* Please see the file description for additional defines which this kernel expects.
*
* \param[in] kernelParams All the PME GPU data.
- * \param[in] gm_coefficients Atom charges/coefficients.
- * \param[in] gm_grid Global 3D grid.
+ * \param[in] gm_coefficientsA Atom charges/coefficients in the unperturbed state, or FEP
+ * state A.
+ * \param[in]  gm_coefficientsB          Atom charges/coefficients in FEP state B. Only used
+ * when gathering forces from two separate grids (numGrids == 2).
+ * \param[in] gm_gridA Global 3D grid for the unperturbed state, FEP
+ * state A or the single grid used for interpolated coefficients on one grid in FEP A/B.
+ * \param[in] gm_gridB Global 3D grid for FEP state B when using dual
+ * grids (when calculating energy and virials).
* \param[in] gm_theta Atom spline parameter values
* \param[in] gm_dtheta Atom spline parameter derivatives
* \param[in] gm_gridlineIndices Atom gridline indices (ivec)
*/
__attribute__((reqd_work_group_size(order, order, atomsPerBlock)))
__kernel void CUSTOMIZED_KERNEL_NAME(pme_gather_kernel)(const struct PmeOpenCLKernelParams kernelParams,
- __global const float* __restrict__ gm_coefficients,
- __global const float* __restrict__ gm_grid,
+ __global const float* __restrict__ gm_coefficientsA,
+ __global const float* __restrict__ gm_coefficientsB,
+ __global const float* __restrict__ gm_gridA,
+ __global const float* __restrict__ gm_gridB,
__global const float* __restrict__ gm_theta,
__global const float* __restrict__ gm_dtheta,
__global const int* __restrict__ gm_gridlineIndices,
__global float* __restrict__ gm_forces)
{
+ assert(numGrids == 1 || numGrids == 2);
+
/* These are the atom indices - for the shared and global memory */
const int atomIndexLocal = get_local_id(ZZ);
const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
float fy = 0.0F;
float fz = 0.0F;
- const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
+ int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);
- if (chargeCheck)
+ const int nx = kernelParams.grid.realGridSize[XX];
+ const int ny = kernelParams.grid.realGridSize[YY];
+ const int nz = kernelParams.grid.realGridSize[ZZ];
+ const int pny = kernelParams.grid.realGridSizePadded[YY];
+ const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
+
+ const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+ const int warpIndex = atomIndexLocal / atomsPerWarp;
+
+ const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
+ const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy);
+ const float2 tdy = sm_splineParams[splineIndexY];
+ const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz);
+ const float2 tdz = sm_splineParams[splineIndexZ];
+
+ const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
+ int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
+ if (wrapY & (iy >= ny))
{
- const int nx = kernelParams.grid.realGridSize[XX];
- const int ny = kernelParams.grid.realGridSize[YY];
- const int nz = kernelParams.grid.realGridSize[ZZ];
- const int pny = kernelParams.grid.realGridSizePadded[YY];
- const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
-
- const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
- const int warpIndex = atomIndexLocal / atomsPerWarp;
-
- const int splineIndexBase = getSplineParamIndexBase(warpIndex, atomWarpIndex);
- const int splineIndexY = getSplineParamIndex(splineIndexBase, YY, ithy);
- const float2 tdy = sm_splineParams[splineIndexY];
- const int splineIndexZ = getSplineParamIndex(splineIndexBase, ZZ, ithz);
- const float2 tdz = sm_splineParams[splineIndexZ];
-
- const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
- int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
- if (wrapY & (iy >= ny))
- {
- iy -= ny;
- }
- int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
- if (iz >= nz)
- {
- iz -= nz;
- }
- const int constOffset = iy * pnz + iz;
+ iy -= ny;
+ }
+ int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+ if (iz >= nz)
+ {
+ iz -= nz;
+ }
+ const int constOffset = iy * pnz + iz;
-#pragma unroll order
- for (int ithx = 0; (ithx < order); ithx++)
- {
- int ix = ixBase + ithx;
- if (wrapX & (ix >= nx))
- {
- ix -= nx;
- }
- const int gridIndexGlobal = ix * pny * pnz + constOffset;
- assert(gridIndexGlobal >= 0);
- const float gridValue = gm_grid[gridIndexGlobal];
- assert(isfinite(gridValue));
- const int splineIndexX = getSplineParamIndex(splineIndexBase, XX, ithx);
- const float2 tdx = sm_splineParams[splineIndexX];
- const float fxy1 = tdz.x * gridValue;
- const float fz1 = tdz.y * gridValue;
- fx += tdx.y * tdy.x * fxy1;
- fy += tdx.x * tdy.y * fxy1;
- fz += tdx.x * tdy.x * fz1;
- }
+ if (chargeCheck)
+ {
+ sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase, tdy,
+ tdz, sm_splineParams, gm_gridA);
}
// Reduction of partial force contributions
barrier(CLK_LOCAL_MEM_FENCE);
/* Calculating the final forces with no component branching, atomsPerBlock threads */
- const int forceIndexLocal = threadLocalId;
- const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
+ const int forceIndexLocal = threadLocalId;
+ const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
+ const float scale = kernelParams.current.scale;
if (forceIndexLocal < atomsPerBlock)
{
- const float3 atomForces = vload3(forceIndexLocal, sm_forces);
- const float negCoefficient = -gm_coefficients[forceIndexGlobal];
- float3 result;
- result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
- result.y = negCoefficient
- * (kernelParams.current.recipBox[XX][YY] * atomForces.x
- + kernelParams.current.recipBox[YY][YY] * atomForces.y);
- result.z = negCoefficient
- * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x
- + kernelParams.current.recipBox[YY][ZZ] * atomForces.y
- + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
- vstore3(result, forceIndexLocal, sm_forces);
+ calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal,
+ kernelParams.current.recipBox, scale, gm_coefficientsA);
}
#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
gm_forces[outputIndexGlobal] = outputForceComponent;
}
}
+
+ if (numGrids == 2)
+ {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ fx = 0.0f;
+ fy = 0.0f;
+ fz = 0.0f;
+ chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
+ if (chargeCheck)
+ {
+ sumForceComponents(&fx, &fy, &fz, ixBase, nx, pny, pnz, constOffset, splineIndexBase,
+ tdy, tdz, sm_splineParams, gm_gridB);
+ }
+ reduce_atom_forces(sm_forces, atomIndexLocal, splineIndex, lineIndex,
+ kernelParams.grid.realGridSizeFP, fx, fy, fz, sm_forceReduction, sm_forceTemp);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (forceIndexLocal < atomsPerBlock)
+ {
+ calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal,
+ kernelParams.current.recipBox, 1.0F - scale, gm_coefficientsB);
+ }
+
+#if !defined(_AMD_SOURCE_) && !defined(_NVIDIA_SOURCE_)
+ /* This is only here for execution of e.g. 32-sized warps on 16-wide hardware; this was
+ * __syncwarp() in CUDA. #2519
+ */
+ barrier(CLK_LOCAL_MEM_FENCE);
+#endif
+
+ /* Writing or adding the final forces component-wise, single warp */
+ if (threadLocalId < iterThreads)
+ {
+#pragma unroll
+ for (int i = 0; i < numIter; i++)
+ {
+ const int outputIndexLocal = i * iterThreads + threadLocalId;
+ const int outputIndexGlobal = get_group_id(XX) * blockForcesSize + outputIndexLocal;
+ const float outputForceComponent = sm_forces[outputIndexLocal];
+ gm_forces[outputIndexGlobal] += outputForceComponent;
+ }
+ }
+ }
}
}
}
+/*! \brief Calculate the sum of the force partial components (in X, Y and Z)
+ *
+ * \tparam[in] order The PME order (must be 4).
+ * \tparam[in] atomsPerWarp The number of atoms per GPU warp.
+ * \tparam[in] wrapX Tells if the grid is wrapped in the X dimension.
+ * \tparam[in] wrapY Tells if the grid is wrapped in the Y dimension.
+ * \param[out] fx The force partial component in the X dimension.
+ * \param[out] fy The force partial component in the Y dimension.
+ * \param[out] fz The force partial component in the Z dimension.
+ * \param[in] ithyMin The thread minimum index in the Y dimension.
+ * \param[in] ithyMax The thread maximum index in the Y dimension.
+ * \param[in] ixBase The grid line index base value in the X dimension.
+ * \param[in] iz The grid line index in the Z dimension.
+ * \param[in] nx The grid real size in the X dimension.
+ * \param[in] ny The grid real size in the Y dimension.
+ * \param[in] pny The padded grid real size in the Y dimension.
+ * \param[in] pnz The padded grid real size in the Z dimension.
+ * \param[in] atomIndexLocal The atom index for this thread.
+ * \param[in] splineIndexBase The base value of the spline parameter index.
+ * \param[in] tdz The theta and dtheta in the Z dimension.
+ * \param[in] sm_gridlineIndices Shared memory array of grid line indices.
+ * \param[in] sm_theta Shared memory array of atom theta values.
+ * \param[in] sm_dtheta Shared memory array of atom dtheta values.
+ * \param[in] gm_grid Global memory array of the grid to use.
+ */
+template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
+__device__ __forceinline__ void sumForceComponents(float* __restrict__ fx,
+ float* __restrict__ fy,
+ float* __restrict__ fz,
+ const int ithyMin,
+ const int ithyMax,
+ const int ixBase,
+ const int iz,
+ const int nx,
+ const int ny,
+ const int pny,
+ const int pnz,
+ const int atomIndexLocal,
+ const int splineIndexBase,
+ const float2 tdz,
+ const int* __restrict__ sm_gridlineIndices,
+ const float* __restrict__ sm_theta,
+ const float* __restrict__ sm_dtheta,
+ const float* __restrict__ gm_grid)
+{
+#pragma unroll
+ for (int ithy = ithyMin; ithy < ithyMax; ithy++)
+ {
+ const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+ const float2 tdy = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);
+
+ int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
+ if (wrapY & (iy >= ny))
+ {
+ iy -= ny;
+ }
+ const int constOffset = iy * pnz + iz;
+
+#pragma unroll
+ for (int ithx = 0; (ithx < order); ithx++)
+ {
+ int ix = ixBase + ithx;
+ if (wrapX & (ix >= nx))
+ {
+ ix -= nx;
+ }
+ const int gridIndexGlobal = ix * pny * pnz + constOffset;
+ assert(gridIndexGlobal >= 0);
+ const float gridValue = gm_grid[gridIndexGlobal];
+ assert(isfinite(gridValue));
+ const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+ const float2 tdx = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
+ const float fxy1 = tdz.x * gridValue;
+ const float fz1 = tdz.y * gridValue;
+ *fx += tdx.y * tdy.x * fxy1;
+ *fy += tdx.x * tdy.y * fxy1;
+ *fz += tdx.x * tdy.x * fz1;
+ }
+ }
+}
+
+/*! \brief Calculate the grid forces and store them in shared memory.
+ *
+ * \param[in,out] sm_forces Shared memory array with the output forces.
+ * \param[in] forceIndexLocal The local (per thread) index in the sm_forces array.
+ * \param[in] forceIndexGlobal The index of the thread in the gm_coefficients array.
+ * \param[in] recipBox The reciprocal box.
+ * \param[in]  scale             The scale to use when calculating the forces; for the second
+ *                               (FEP state B) grid the caller passes (1.0F - scale).
+ * \param[in]  gm_coefficients   Global memory array of the coefficients to use: the charges of
+ *                               the unperturbed state or FEP state A when gathering from the
+ *                               first grid, or the FEP state B charges when gathering from the
+ *                               second grid.
+ */
+__device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces,
+ const int forceIndexLocal,
+ const int forceIndexGlobal,
+ const float recipBox[DIM][DIM],
+ const float scale,
+ const float* __restrict__ gm_coefficients)
+{
+ const float3 atomForces = sm_forces[forceIndexLocal];
+ float negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
+ float3 result;
+ result.x = negCoefficient * recipBox[XX][XX] * atomForces.x;
+ result.y = negCoefficient * (recipBox[XX][YY] * atomForces.x + recipBox[YY][YY] * atomForces.y);
+ result.z = negCoefficient
+ * (recipBox[XX][ZZ] * atomForces.x + recipBox[YY][ZZ] * atomForces.y
+ + recipBox[ZZ][ZZ] * atomForces.z);
+ sm_forces[forceIndexLocal] = result;
+}
+
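Taken together with the dual-grid pass in the kernel below, the two calculateAndStoreGridForces calls are meant to combine the per-grid contributions linearly. A sketch of the assumed relation, writing s for kernelParams.current.scale (assumed to carry the state-A weight), B for recipBox, and f^A, f^B for the fractional force sums gathered from the two grids:

    \mathbf{F} = -\, s\, q_A\, B^{\mathsf{T}} \mathbf{f}^{A} \;-\; (1 - s)\, q_B\, B^{\mathsf{T}} \mathbf{f}^{B}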
/*! \brief
* A CUDA kernel which gathers the atom forces from the grid.
* The grid is assumed to be wrapped in dimension Z.
* \tparam[in] order The PME order (must be 4 currently).
* \tparam[in] wrapX Tells if the grid is wrapped in the X dimension.
* \tparam[in] wrapY Tells if the grid is wrapped in the Y dimension.
+ * \tparam[in] numGrids The number of grids to use in the kernel. Can be 1 or 2.
* \tparam[in] readGlobal Tells if we should read spline values from global memory
* \tparam[in] threadsPerAtom How many threads work on each atom
*
* \param[in] kernelParams All the PME GPU data.
*/
-template<int order, bool wrapX, bool wrapY, bool readGlobal, ThreadsPerAtom threadsPerAtom>
+template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
__launch_bounds__(c_gatherMaxThreadsPerBlock, c_gatherMinBlocksPerMP) __global__
void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams)
{
+ assert(numGrids == 1 || numGrids == 2);
+
/* Global memory pointers */
- const float* __restrict__ gm_coefficients = kernelParams.atoms.d_coefficients;
- const float* __restrict__ gm_grid = kernelParams.grid.d_realGrid;
- float* __restrict__ gm_forces = kernelParams.atoms.d_forces;
+ const float* __restrict__ gm_coefficientsA = kernelParams.atoms.d_coefficients[0];
+ const float* __restrict__ gm_coefficientsB = kernelParams.atoms.d_coefficients[1];
+ const float* __restrict__ gm_gridA = kernelParams.grid.d_realGrid[0];
+ const float* __restrict__ gm_gridB = kernelParams.grid.d_realGrid[1];
+ float* __restrict__ gm_forces = kernelParams.atoms.d_forces;
/* Global memory pointers for readGlobal */
const float* __restrict__ gm_theta = kernelParams.atoms.d_theta;
// Coordinates
__shared__ float3 sm_coordinates[atomsPerBlock];
/* Staging coefficients/charges */
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficients);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, gm_coefficientsA);
/* Staging coordinates */
pme_gpu_stage_atom_data<float3, atomsPerBlock, 1>(sm_coordinates, gm_coordinates);
else
{
atomX = gm_coordinates[atomIndexGlobal];
- atomCharge = gm_coefficients[atomIndexGlobal];
+ atomCharge = gm_coefficientsA[atomIndexGlobal];
}
calculate_splines<order, atomsPerBlock, atomsPerWarp, true, false>(
kernelParams, atomIndexOffset, atomX, atomCharge, sm_theta, sm_dtheta, sm_gridlineIndices);
float fy = 0.0f;
float fz = 0.0f;
- const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficients[atomIndexGlobal]);
-
- if (chargeCheck)
- {
- const int nx = kernelParams.grid.realGridSize[XX];
- const int ny = kernelParams.grid.realGridSize[YY];
- const int nz = kernelParams.grid.realGridSize[ZZ];
- const int pny = kernelParams.grid.realGridSizePadded[YY];
- const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
-
- const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
- const int warpIndex = atomIndexLocal / atomsPerWarp;
+ const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsA[atomIndexGlobal]);
- const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
- const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
- const float2 tdz = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);
+ const int nx = kernelParams.grid.realGridSize[XX];
+ const int ny = kernelParams.grid.realGridSize[YY];
+ const int nz = kernelParams.grid.realGridSize[ZZ];
+ const int pny = kernelParams.grid.realGridSizePadded[YY];
+ const int pnz = kernelParams.grid.realGridSizePadded[ZZ];
- int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
- const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
+ const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+ const int warpIndex = atomIndexLocal / atomsPerWarp;
- if (iz >= nz)
- {
- iz -= nz;
- }
- int constOffset, iy;
+ const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+ const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
+ const float2 tdz = make_float2(sm_theta[splineIndexZ], sm_dtheta[splineIndexZ]);
- const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y;
- const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1;
- for (int ithy = ithyMin; ithy < ithyMax; ithy++)
- {
- const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
- const float2 tdy = make_float2(sm_theta[splineIndexY], sm_dtheta[splineIndexY]);
+ int iz = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+ const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
- iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
- if (wrapY & (iy >= ny))
- {
- iy -= ny;
- }
- constOffset = iy * pnz + iz;
-
-#pragma unroll
- for (int ithx = 0; (ithx < order); ithx++)
- {
- int ix = ixBase + ithx;
- if (wrapX & (ix >= nx))
- {
- ix -= nx;
- }
- const int gridIndexGlobal = ix * pny * pnz + constOffset;
- assert(gridIndexGlobal >= 0);
- const float gridValue = gm_grid[gridIndexGlobal];
- assert(isfinite(gridValue));
- const int splineIndexX =
- getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
- const float2 tdx = make_float2(sm_theta[splineIndexX], sm_dtheta[splineIndexX]);
- const float fxy1 = tdz.x * gridValue;
- const float fz1 = tdz.y * gridValue;
- fx += tdx.y * tdy.x * fxy1;
- fy += tdx.x * tdy.y * fxy1;
- fz += tdx.x * tdy.x * fz1;
- }
- }
+ if (iz >= nz)
+ {
+ iz -= nz;
}
+ const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : threadIdx.y;
+ const int ithyMax = (threadsPerAtom == ThreadsPerAtom::Order) ? order : threadIdx.y + 1;
+ if (chargeCheck)
+ {
+ sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(
+ &fx, &fy, &fz, ithyMin, ithyMax, ixBase, iz, nx, ny, pny, pnz, atomIndexLocal,
+ splineIndexBase, tdz, sm_gridlineIndices, sm_theta, sm_dtheta, gm_gridA);
+ }
// Reduction of partial force contributions
__shared__ float3 sm_forces[atomsPerBlock];
reduce_atom_forces<order, atomDataSize, blockSize>(sm_forces, atomIndexLocal, splineIndex, lineIndex,
__syncthreads();
/* Calculating the final forces with no component branching, atomsPerBlock threads */
- const int forceIndexLocal = threadLocalId;
- const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
+ const int forceIndexLocal = threadLocalId;
+ const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
+ const float scale = kernelParams.current.scale;
if (forceIndexLocal < atomsPerBlock)
{
- const float3 atomForces = sm_forces[forceIndexLocal];
- const float negCoefficient = -gm_coefficients[forceIndexGlobal];
- float3 result;
- result.x = negCoefficient * kernelParams.current.recipBox[XX][XX] * atomForces.x;
- result.y = negCoefficient
- * (kernelParams.current.recipBox[XX][YY] * atomForces.x
- + kernelParams.current.recipBox[YY][YY] * atomForces.y);
- result.z = negCoefficient
- * (kernelParams.current.recipBox[XX][ZZ] * atomForces.x
- + kernelParams.current.recipBox[YY][ZZ] * atomForces.y
- + kernelParams.current.recipBox[ZZ][ZZ] * atomForces.z);
- sm_forces[forceIndexLocal] = result;
+ calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal,
+ kernelParams.current.recipBox, scale, gm_coefficientsA);
}
__syncwarp();
#pragma unroll
for (int i = 0; i < numIter; i++)
{
- int outputIndexLocal = i * iterThreads + threadLocalId;
- int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
- const float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
- gm_forces[outputIndexGlobal] = outputForceComponent;
+ int outputIndexLocal = i * iterThreads + threadLocalId;
+ int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
+ float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
+ gm_forces[outputIndexGlobal] = outputForceComponent;
+ }
+ }
+
+ if (numGrids == 2)
+ {
+ /* We must sync here since the same shared memory is used as above. */
+ __syncthreads();
+ fx = 0.0f;
+ fy = 0.0f;
+ fz = 0.0f;
+ const int chargeCheck = pme_gpu_check_atom_charge(gm_coefficientsB[atomIndexGlobal]);
+ if (chargeCheck)
+ {
+ sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(
+ &fx, &fy, &fz, ithyMin, ithyMax, ixBase, iz, nx, ny, pny, pnz, atomIndexLocal,
+ splineIndexBase, tdz, sm_gridlineIndices, sm_theta, sm_dtheta, gm_gridB);
+ }
+ // Reduction of partial force contributions
+ reduce_atom_forces<order, atomDataSize, blockSize>(sm_forces, atomIndexLocal, splineIndex,
+ lineIndex, kernelParams.grid.realGridSizeFP,
+ fx, fy, fz);
+ __syncthreads();
+
+ /* Calculating the final forces with no component branching, atomsPerBlock threads */
+ if (forceIndexLocal < atomsPerBlock)
+ {
+ calculateAndStoreGridForces(sm_forces, forceIndexLocal, forceIndexGlobal,
+ kernelParams.current.recipBox, 1.0F - scale, gm_coefficientsB);
+ }
+
+ __syncwarp();
+
+ /* Writing or adding the final forces component-wise, single warp */
+ if (threadLocalId < iterThreads)
+ {
+#pragma unroll
+ for (int i = 0; i < numIter; i++)
+ {
+ int outputIndexLocal = i * iterThreads + threadLocalId;
+ int outputIndexGlobal = blockIndex * blockForcesSize + outputIndexLocal;
+ float outputForceComponent = ((float*)sm_forces)[outputIndexLocal];
+ gm_forces[outputIndexGlobal] += outputForceComponent;
+ }
}
}
}
//! Kernel instantiations
// clang-format off
-template __global__ void pme_gather_kernel<4, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-template __global__ void pme_gather_kernel<4, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-template __global__ void pme_gather_kernel<4, true, true, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-template __global__ void pme_gather_kernel<4, true, true, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-// clang-format on
\ No newline at end of file
+template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_gather_kernel<4, true, true, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+// clang-format on
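To illustrate how the new numGrids template parameter is intended to be chosen, a hypothetical dispatch snippet (the real selection lives in the PME GPU program setup; the names here are assumptions):

    // Pick the gather kernel variant that matches the number of grids in use (sketch).
    const int numGrids = pmeGpu->common->ngrids; // 1: plain Coulomb PME, 2: Coulomb FEP with perturbed charges
    auto* gatherKernel = (numGrids == 2)
                                 ? pme_gather_kernel<4, true, true, 2, true, ThreadsPerAtom::Order>
                                 : pme_gather_kernel<4, true, true, 1, true, ThreadsPerAtom::Order>;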
enum gmx_fft_direction dir,
gmx_wallcycle_t wcycle)
{
- GMX_ASSERT(gridIndex == 0, "Only single grid supported");
if (pme_gpu_settings(pme->gpu).performGPUFFT)
{
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
}
}
-void pme_gpu_launch_spread(gmx_pme_t* pme, GpuEventSynchronizer* xReadyOnDevice, gmx_wallcycle* wcycle)
+void pme_gpu_launch_spread(gmx_pme_t* pme,
+ GpuEventSynchronizer* xReadyOnDevice,
+ gmx_wallcycle* wcycle,
+ const real lambdaQ)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
GMX_ASSERT(xReadyOnDevice || !pme->bPPnode || (GMX_GPU != GMX_GPU_CUDA),
"Need a valid xReadyOnDevice on PP+PME ranks with CUDA.");
+ GMX_ASSERT(pme->doCoulomb, "Only Coulomb PME can be run on GPU.");
PmeGpu* pmeGpu = pme->gpu;
- const unsigned int gridIndex = 0;
- real* fftgrid = pme->fftgrid[gridIndex];
+ GMX_ASSERT(pmeGpu->common->ngrids == 1 || (pmeGpu->common->ngrids == 2 && pme->bFEP_q),
+ "If charges are not perturbed there should only be one PME grid; if charges are "
+ "perturbed there should be two grids.");
+
+ /* PME on GPU can currently manage two grids:
+ * grid_index=0: Coulomb PME with charges in the normal state or from FEP state A.
+ * grid_index=1: Coulomb PME with charges from FEP state B.
+ */
+ real** fftgrids = pme->fftgrid;
/* Spread the coefficients on a grid */
const bool computeSplines = true;
const bool spreadCharges = true;
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
- pme_gpu_spread(pmeGpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+ pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, const gmx::StepWorkload& stepWork)
{
- PmeGpu* pmeGpu = pme->gpu;
- const auto& settings = pmeGpu->settings;
- const unsigned int gridIndex = 0;
- t_complex* cfftgrid = pme->cfftgrid[gridIndex];
+ PmeGpu* pmeGpu = pme->gpu;
+ const auto& settings = pmeGpu->settings;
// There's no support for computing energy without virial, or vice versa
const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
if (!settings.performGPUFFT)
try
{
- /* do R2C 3D-FFT */
- parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle);
-
- /* solve in k-space for our local cells */
- if (settings.performGPUSolve)
- {
- // TODO grid ordering should be set up at pme init time.
- const auto gridOrdering = settings.useDecomposition ? GridOrdering::YZX : GridOrdering::XYZ;
- wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
- wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
- pme_gpu_solve(pmeGpu, cfftgrid, gridOrdering, computeEnergyAndVirial);
- wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
- wallcycle_stop(wcycle, ewcLAUNCH_GPU);
- }
- else
+ /* The 3dffts and the solve are done in a loop to simplify things, even if this means that
+ * there will be two kernel launches for solve. */
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
- wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE);
-#pragma omp parallel for num_threads(pme->nthread) schedule(static)
- for (int thread = 0; thread < pme->nthread; thread++)
+ /* do R2C 3D-FFT */
+ t_complex* cfftgrid = pme->cfftgrid[gridIndex];
+ parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_REAL_TO_COMPLEX, wcycle);
+
+ /* solve in k-space for our local cells */
+ if (settings.performGPUSolve)
{
- solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, pme->nthread, thread);
+ const auto gridOrdering =
+ settings.useDecomposition ? GridOrdering::YZX : GridOrdering::XYZ;
+ wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
+ wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+ pme_gpu_solve(pmeGpu, gridIndex, cfftgrid, gridOrdering, computeEnergyAndVirial);
+ wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
+ wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+ }
+ else
+ {
+ wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE);
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+ for (int thread = 0; thread < pme->nthread; thread++)
+ {
+ solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial,
+ pme->nthread, thread);
+ }
+ wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE);
}
- wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE);
- }
- parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle);
+ parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle);
+ }
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
}
-void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycle)
+void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycle, const real lambdaQ)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
- const unsigned int gridIndex = 0;
- real* fftgrid = pme->fftgrid[gridIndex];
- pme_gpu_gather(pme->gpu, reinterpret_cast<float*>(fftgrid));
+
+ float** fftgrids = pme->fftgrid;
+ pme_gpu_gather(pme->gpu, fftgrids, lambdaQ);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
GMX_ASSERT(enerd, "Invalid energy output manager");
forceWithVirial->addVirialContribution(output.coulombVirial_);
enerd->term[F_COUL_RECIP] += output.coulombEnergy_;
+ enerd->dvdl_lin[efptCOUL] += output.coulombDvdl_;
}
if (output.haveForceOutput_)
{
gmx_wallcycle* wcycle,
gmx::ForceWithVirial* forceWithVirial,
gmx_enerdata_t* enerd,
+ const real lambdaQ,
GpuTaskCompletion completionKind)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
pme_gpu_update_timings(pme->gpu);
// There's no support for computing energy without virial, or vice versa
const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
- PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial);
+ PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial,
+ pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0);
wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_,
}
// This is used by PME-only ranks
-PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme, const bool computeEnergyAndVirial, gmx_wallcycle* wcycle)
+PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* pme,
+ const bool computeEnergyAndVirial,
+ const real lambdaQ,
+ gmx_wallcycle* wcycle)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
pme_gpu_synchronize(pme->gpu);
}
- PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial);
+ PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial,
+ pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0);
wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
return output;
}
const gmx::StepWorkload& stepWork,
gmx_wallcycle* wcycle,
gmx::ForceWithVirial* forceWithVirial,
- gmx_enerdata_t* enerd)
+ gmx_enerdata_t* enerd,
+ const real lambdaQ)
{
// There's no support for computing energy without virial, or vice versa
const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
- PmeOutput output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, wcycle);
+ PmeOutput output = pme_gpu_wait_finish_task(
+ pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0, wcycle);
GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_,
"When forces are reduced on the CPU, there needs to be force output");
pme_gpu_reduce_outputs(computeEnergyAndVirial, output, wcycle, forceWithVirial, enerd);
}
}
-GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu)
+GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex)
{
const PmeGpuCudaKernelParams* kernelParamsPtr = pmeGpu->kernelParams.get();
ivec realGridSize, realGridSizePadded, complexGridSizePadded;
const int realGridSizePaddedTotal =
realGridSizePadded[XX] * realGridSizePadded[YY] * realGridSizePadded[ZZ];
- realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid;
+ realGrid_ = (cufftReal*)kernelParamsPtr->grid.d_realGrid[gridIndex];
GMX_RELEASE_ASSERT(realGrid_, "Bad (null) input real-space grid");
- complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid;
+ complexGrid_ = (cufftComplex*)kernelParamsPtr->grid.d_fourierGrid[gridIndex];
GMX_RELEASE_ASSERT(complexGrid_, "Bad (null) input complex grid");
cufftResult_t result;
* Constructs CUDA/OpenCL FFT plans for performing 3D FFT on a PME grid.
*
* \param[in] pmeGpu The PME GPU structure.
+ * \param[in] gridIndex The index of the grid on which to perform the calculations.
*/
- GpuParallel3dFft(const PmeGpu* pmeGpu);
+ GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex);
/*! \brief Destroys the FFT plans. */
~GpuParallel3dFft();
/*! \brief Performs the FFT transform in given direction
}
}
-GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu)
+GpuParallel3dFft::GpuParallel3dFft(const PmeGpu* pmeGpu, const int gridIndex)
{
// Extracting all the data from PME GPU
std::array<size_t, DIM> realGridSize, realGridSizePadded, complexGridSizePadded;
}
cl_context context = pmeGpu->archSpecific->deviceContext_.context();
deviceStreams_.push_back(pmeGpu->archSpecific->pmeStream_.stream());
- realGrid_ = kernelParamsPtr->grid.d_realGrid;
- complexGrid_ = kernelParamsPtr->grid.d_fourierGrid;
+ realGrid_ = kernelParamsPtr->grid.d_realGrid[gridIndex];
+ complexGrid_ = kernelParamsPtr->grid.d_fourierGrid[gridIndex];
const bool performOutOfPlaceFFT = pmeGpu->archSpecific->performOutOfPlaceFFT;
#include <cassert>
#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "pme.cuh"
#include "pme_grid.h"
/*! \brief
* General purpose function for loading atom-related data from global to shared memory.
*
- * \tparam[in] T Data type (float/int/...)
- * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be
- * accounted for in the size of the shared memory array.
- * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for
- * an rvec coordinates array).
- * \param[out] sm_destination Shared memory array for output.
- * \param[in] gm_source Global memory array for input.
+ * \tparam[in] T Data type (float/int/...)
+ * \tparam[in] atomsPerBlock Number of atoms processed by a block - should be accounted for in
+ * the size of the shared memory array.
+ * \tparam[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec
+ * coordinates array).
+ * \param[out] sm_destination Shared memory array for output.
+ * \param[in] gm_source Global memory array for input.
*/
template<typename T, int atomsPerBlock, int dataCountPerAtom>
__device__ __forceinline__ void pme_gpu_stage_atom_data(T* __restrict__ sm_destination,
void pme_gpu_alloc_energy_virial(PmeGpu* pmeGpu)
{
const size_t energyAndVirialSize = c_virialAndEnergyCount * sizeof(float);
- allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy, c_virialAndEnergyCount,
- pmeGpu->archSpecific->deviceContext_);
- pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy), energyAndVirialSize);
+
+ GMX_ASSERT(
+ pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
+ "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU");
+
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ allocateDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex],
+ c_virialAndEnergyCount, pmeGpu->archSpecific->deviceContext_);
+ pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_virialAndEnergy[gridIndex]), energyAndVirialSize);
+ }
}
void pme_gpu_free_energy_virial(PmeGpu* pmeGpu)
{
- freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy);
- pfree(pmeGpu->staging.h_virialAndEnergy);
- pmeGpu->staging.h_virialAndEnergy = nullptr;
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ freeDeviceBuffer(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex]);
+ pfree(pmeGpu->staging.h_virialAndEnergy[gridIndex]);
+ pmeGpu->staging.h_virialAndEnergy[gridIndex] = nullptr;
+ }
}
void pme_gpu_clear_energy_virial(const PmeGpu* pmeGpu)
{
- clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy, 0,
- c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_);
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ clearDeviceBufferAsync(&pmeGpu->kernelParams->constants.d_virialAndEnergy[gridIndex], 0,
+ c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_);
+ }
}
-void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu)
+void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, const int gridIndex)
{
+ GMX_ASSERT(
+ pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
+ "Only one (normal Coulomb PME) or two (FEP coulomb PME) PME grids can be used on GPU");
+ GMX_ASSERT(gridIndex < pmeGpu->common->ngrids,
+ "Invalid combination of gridIndex and number of grids");
+
const int splineValuesOffset[DIM] = { 0, pmeGpu->kernelParams->grid.realGridSize[XX],
pmeGpu->kernelParams->grid.realGridSize[XX]
+ pmeGpu->kernelParams->grid.realGridSize[YY] };
const int newSplineValuesSize = pmeGpu->kernelParams->grid.realGridSize[XX]
+ pmeGpu->kernelParams->grid.realGridSize[YY]
+ pmeGpu->kernelParams->grid.realGridSize[ZZ];
- const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize);
- reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, newSplineValuesSize,
- &pmeGpu->archSpecific->splineValuesSize,
- &pmeGpu->archSpecific->splineValuesSizeAlloc,
+ const bool shouldRealloc = (newSplineValuesSize > pmeGpu->archSpecific->splineValuesSize[gridIndex]);
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex],
+ newSplineValuesSize, &pmeGpu->archSpecific->splineValuesSize[gridIndex],
+ &pmeGpu->archSpecific->splineValuesCapacity[gridIndex],
pmeGpu->archSpecific->deviceContext_);
if (shouldRealloc)
{
/* Reallocate the host buffer */
- pfree(pmeGpu->staging.h_splineModuli);
- pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_splineModuli),
+ pfree(pmeGpu->staging.h_splineModuli[gridIndex]);
+ pmalloc(reinterpret_cast<void**>(&pmeGpu->staging.h_splineModuli[gridIndex]),
newSplineValuesSize * sizeof(float));
}
for (int i = 0; i < DIM; i++)
{
- memcpy(pmeGpu->staging.h_splineModuli + splineValuesOffset[i],
+ memcpy(pmeGpu->staging.h_splineModuli[gridIndex] + splineValuesOffset[i],
pmeGpu->common->bsp_mod[i].data(), pmeGpu->common->bsp_mod[i].size() * sizeof(float));
}
/* TODO: pin original buffer instead! */
- copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli, pmeGpu->staging.h_splineModuli,
- 0, newSplineValuesSize, pmeGpu->archSpecific->pmeStream_,
- pmeGpu->settings.transferKind, nullptr);
+ copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex],
+ pmeGpu->staging.h_splineModuli[gridIndex], 0, newSplineValuesSize,
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
}
void pme_gpu_free_bspline_values(const PmeGpu* pmeGpu)
{
- pfree(pmeGpu->staging.h_splineModuli);
- freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli);
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ pfree(pmeGpu->staging.h_splineModuli[gridIndex]);
+ freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_splineModuli[gridIndex]);
+ }
}
void pme_gpu_realloc_forces(PmeGpu* pmeGpu)
pmeGpu->settings.transferKind, nullptr);
}
-void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients)
+void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu,
+ const float* h_coefficients,
+ const int gridIndex)
{
GMX_ASSERT(h_coefficients, "Bad host-side charge buffer in PME GPU");
const size_t newCoefficientsSize = pmeGpu->nAtomsAlloc;
GMX_ASSERT(newCoefficientsSize > 0, "Bad number of atoms in PME GPU");
- reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients, newCoefficientsSize,
- &pmeGpu->archSpecific->coefficientsSize,
- &pmeGpu->archSpecific->coefficientsSizeAlloc,
+ reallocateDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
+ newCoefficientsSize, &pmeGpu->archSpecific->coefficientsSize[gridIndex],
+ &pmeGpu->archSpecific->coefficientsCapacity[gridIndex],
pmeGpu->archSpecific->deviceContext_);
- copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients,
+ copyToDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex],
const_cast<float*>(h_coefficients), 0, pmeGpu->kernelParams->atoms.nAtoms,
pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
const size_t paddingCount = pmeGpu->nAtomsAlloc - paddingIndex;
if (paddingCount > 0)
{
- clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients, paddingIndex,
+ clearDeviceBufferAsync(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex], paddingIndex,
paddingCount, pmeGpu->archSpecific->pmeStream_);
}
}
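As a usage sketch, the new gridIndex argument lets the atom-(re)initialization path stage one coefficient set per grid, roughly as follows (hypothetical caller; the real pme_gpu_reinit_atoms also reallocates spline and force buffers):

    // Stage state-A charges on grid 0; stage state-B charges on grid 1 only when charges are perturbed.
    pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, chargesA, 0);
    if (pmeGpu->common->ngrids == 2)
    {
        pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, chargesB, 1);
    }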
void pme_gpu_free_coefficients(const PmeGpu* pmeGpu)
{
- freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients);
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ freeDeviceBuffer(&pmeGpu->kernelParams->atoms.d_coefficients[gridIndex]);
+ }
}
void pme_gpu_realloc_spline_data(PmeGpu* pmeGpu)
void pme_gpu_realloc_grids(PmeGpu* pmeGpu)
{
- auto* kernelParamsPtr = pmeGpu->kernelParams.get();
+ auto* kernelParamsPtr = pmeGpu->kernelParams.get();
+
const int newRealGridSize = kernelParamsPtr->grid.realGridSizePadded[XX]
* kernelParamsPtr->grid.realGridSizePadded[YY]
* kernelParamsPtr->grid.realGridSizePadded[ZZ];
const int newComplexGridSize = kernelParamsPtr->grid.complexGridSizePadded[XX]
* kernelParamsPtr->grid.complexGridSizePadded[YY]
* kernelParamsPtr->grid.complexGridSizePadded[ZZ] * 2;
- // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float
- if (pmeGpu->archSpecific->performOutOfPlaceFFT)
- {
- /* 2 separate grids */
- reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, newComplexGridSize,
- &pmeGpu->archSpecific->complexGridSize,
- &pmeGpu->archSpecific->complexGridSizeAlloc,
- pmeGpu->archSpecific->deviceContext_);
- reallocateDeviceBuffer(
- &kernelParamsPtr->grid.d_realGrid, newRealGridSize, &pmeGpu->archSpecific->realGridSize,
- &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_);
- }
- else
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
- /* A single buffer so that any grid will fit */
- const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
- reallocateDeviceBuffer(
- &kernelParamsPtr->grid.d_realGrid, newGridsSize, &pmeGpu->archSpecific->realGridSize,
- &pmeGpu->archSpecific->realGridSizeAlloc, pmeGpu->archSpecific->deviceContext_);
- kernelParamsPtr->grid.d_fourierGrid = kernelParamsPtr->grid.d_realGrid;
- pmeGpu->archSpecific->complexGridSize = pmeGpu->archSpecific->realGridSize;
- // the size might get used later for copying the grid
+ // Multiplied by 2 because we count complex grid size for complex numbers, but all allocations/pointers are float
+ if (pmeGpu->archSpecific->performOutOfPlaceFFT)
+ {
+ /* 2 separate grids */
+ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], newComplexGridSize,
+ &pmeGpu->archSpecific->complexGridSize[gridIndex],
+ &pmeGpu->archSpecific->complexGridCapacity[gridIndex],
+ pmeGpu->archSpecific->deviceContext_);
+ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newRealGridSize,
+ &pmeGpu->archSpecific->realGridSize[gridIndex],
+ &pmeGpu->archSpecific->realGridCapacity[gridIndex],
+ pmeGpu->archSpecific->deviceContext_);
+ }
+ else
+ {
+ /* A single buffer so that any grid will fit */
+ const int newGridsSize = std::max(newRealGridSize, newComplexGridSize);
+ reallocateDeviceBuffer(&kernelParamsPtr->grid.d_realGrid[gridIndex], newGridsSize,
+ &pmeGpu->archSpecific->realGridSize[gridIndex],
+ &pmeGpu->archSpecific->realGridCapacity[gridIndex],
+ pmeGpu->archSpecific->deviceContext_);
+ kernelParamsPtr->grid.d_fourierGrid[gridIndex] = kernelParamsPtr->grid.d_realGrid[gridIndex];
+ pmeGpu->archSpecific->complexGridSize[gridIndex] =
+ pmeGpu->archSpecific->realGridSize[gridIndex];
+ // the size might get used later for copying the grid
+ }
}
}
void pme_gpu_free_grids(const PmeGpu* pmeGpu)
{
- if (pmeGpu->archSpecific->performOutOfPlaceFFT)
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
- freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid);
+ if (pmeGpu->archSpecific->performOutOfPlaceFFT)
+ {
+ freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_fourierGrid[gridIndex]);
+ }
+ freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex]);
}
- freeDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid);
}
void pme_gpu_clear_grids(const PmeGpu* pmeGpu)
{
- clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid, 0,
- pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_);
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ clearDeviceBufferAsync(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0,
+ pmeGpu->archSpecific->realGridSize[gridIndex],
+ pmeGpu->archSpecific->pmeStream_);
+ }
}
void pme_gpu_realloc_and_copy_fract_shifts(PmeGpu* pmeGpu)
return haveStreamTasksCompleted(pmeGpu->archSpecific->pmeStream_);
}
-void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid)
+void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, const int gridIndex)
{
- copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid, h_grid, 0, pmeGpu->archSpecific->realGridSize,
+ copyToDeviceBuffer(&pmeGpu->kernelParams->grid.d_realGrid[gridIndex], h_grid, 0,
+ pmeGpu->archSpecific->realGridSize[gridIndex],
pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
}
-void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid)
+void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, const int gridIndex)
{
- copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid, 0,
- pmeGpu->archSpecific->realGridSize, pmeGpu->archSpecific->pmeStream_,
- pmeGpu->settings.transferKind, nullptr);
+ copyFromDeviceBuffer(h_grid, &pmeGpu->kernelParams->grid.d_realGrid[gridIndex], 0,
+ pmeGpu->archSpecific->realGridSize[gridIndex],
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
pmeGpu->archSpecific->syncSpreadGridD2H.markEvent(pmeGpu->archSpecific->pmeStream_);
}
if (pme_gpu_settings(pmeGpu).performGPUFFT)
{
pmeGpu->archSpecific->fftSetup.resize(0);
- for (int i = 0; i < pmeGpu->common->ngrids; i++)
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
{
- pmeGpu->archSpecific->fftSetup.push_back(std::make_unique<GpuParallel3dFft>(pmeGpu));
+ pmeGpu->archSpecific->fftSetup.push_back(std::make_unique<GpuParallel3dFft>(pmeGpu, gridIndex));
}
}
}
pmeGpu->archSpecific->fftSetup.resize(0);
}
-void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, PmeOutput* output)
+void pme_gpu_getEnergyAndVirial(const gmx_pme_t& pme, const float lambda, PmeOutput* output)
{
const PmeGpu* pmeGpu = pme.gpu;
- for (int j = 0; j < c_virialAndEnergyCount; j++)
- {
- GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[j]),
- "PME GPU produces incorrect energy/virial.");
- }
-
- unsigned int j = 0;
- output->coulombVirial_[XX][XX] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
- output->coulombVirial_[YY][YY] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
- output->coulombVirial_[ZZ][ZZ] = 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
- output->coulombVirial_[XX][YY] = output->coulombVirial_[YY][XX] =
- 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
- output->coulombVirial_[XX][ZZ] = output->coulombVirial_[ZZ][XX] =
- 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
- output->coulombVirial_[YY][ZZ] = output->coulombVirial_[ZZ][YY] =
- 0.25F * pmeGpu->staging.h_virialAndEnergy[j++];
- output->coulombEnergy_ = 0.5F * pmeGpu->staging.h_virialAndEnergy[j++];
+
+ GMX_ASSERT(lambda == 1.0 || pmeGpu->common->ngrids == 2,
+ "Invalid combination of lambda and number of grids");
+
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ for (int j = 0; j < c_virialAndEnergyCount; j++)
+ {
+ GMX_ASSERT(std::isfinite(pmeGpu->staging.h_virialAndEnergy[gridIndex][j]),
+ "PME GPU produces incorrect energy/virial.");
+ }
+ }
+ for (int dim1 = 0; dim1 < DIM; dim1++)
+ {
+ for (int dim2 = 0; dim2 < DIM; dim2++)
+ {
+ output->coulombVirial_[dim1][dim2] = 0;
+ }
+ }
+ output->coulombEnergy_ = 0;
+ float scale = 1.0;
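+    /* With two grids the per-grid results are combined by linear interpolation:
+     * grid 0 (state A) is weighted by (1 - lambda) and grid 1 (state B) by lambda. */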
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ if (pmeGpu->common->ngrids == 2)
+ {
+ scale = gridIndex == 0 ? (1.0 - lambda) : lambda;
+ }
+ output->coulombVirial_[XX][XX] +=
+ scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][0];
+ output->coulombVirial_[YY][YY] +=
+ scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][1];
+ output->coulombVirial_[ZZ][ZZ] +=
+ scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][2];
+ output->coulombVirial_[XX][YY] +=
+ scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][3];
+ output->coulombVirial_[YY][XX] +=
+ scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][3];
+ output->coulombVirial_[XX][ZZ] +=
+ scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][4];
+ output->coulombVirial_[ZZ][XX] +=
+ scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][4];
+ output->coulombVirial_[YY][ZZ] +=
+ scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][5];
+ output->coulombVirial_[ZZ][YY] +=
+ scale * 0.25F * pmeGpu->staging.h_virialAndEnergy[gridIndex][5];
+ output->coulombEnergy_ += scale * 0.5F * pmeGpu->staging.h_virialAndEnergy[gridIndex][6];
+ }
+ if (pmeGpu->common->ngrids > 1)
+ {
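+        /* For E(lambda) = (1 - lambda)*E_A + lambda*E_B the derivative is simply E_B - E_A */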
+ output->coulombDvdl_ = 0.5F
+ * (pmeGpu->staging.h_virialAndEnergy[FEP_STATE_B][6]
+ - pmeGpu->staging.h_virialAndEnergy[FEP_STATE_A][6]);
+ }
}
/*! \brief Sets the force-related members in \p output
}
}
-PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVirial)
+PmeOutput pme_gpu_getOutput(const gmx_pme_t& pme, const bool computeEnergyAndVirial, const real lambdaQ)
{
PmeGpu* pmeGpu = pme.gpu;
{
if (pme_gpu_settings(pmeGpu).performGPUSolve)
{
- pme_gpu_getEnergyAndVirial(pme, &output);
+ pme_gpu_getEnergyAndVirial(pme, lambdaQ, &output);
}
else
{
static void pme_gpu_reinit_grids(PmeGpu* pmeGpu)
{
auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
+
+ GMX_ASSERT(
+ pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
+            "Only one (normal Coulomb PME) or two (FEP Coulomb PME) PME grids can be used on GPU");
+
kernelParamsPtr->grid.ewaldFactor =
(M_PI * M_PI) / (pmeGpu->common->ewaldcoeff_q * pmeGpu->common->ewaldcoeff_q);
-
/* The grid size variants */
for (int i = 0; i < DIM; i++)
{
kernelParamsPtr->grid.realGridSizePadded[ZZ] =
(kernelParamsPtr->grid.realGridSize[ZZ] / 2 + 1) * 2;
}
-
/* GPU FFT: n real elements correspond to (n / 2 + 1) complex elements in minor dimension */
kernelParamsPtr->grid.complexGridSize[ZZ] /= 2;
kernelParamsPtr->grid.complexGridSize[ZZ]++;
kernelParamsPtr->grid.complexGridSizePadded[ZZ] = kernelParamsPtr->grid.complexGridSize[ZZ];
pme_gpu_realloc_and_copy_fract_shifts(pmeGpu);
- pme_gpu_realloc_and_copy_bspline_values(pmeGpu);
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ pme_gpu_realloc_and_copy_bspline_values(pmeGpu, gridIndex);
+ }
+
pme_gpu_realloc_grids(pmeGpu);
pme_gpu_reinit_3dfft(pmeGpu);
}
/* TODO: Consider refactoring the CPU PME code to use the same structure,
* so that this function becomes 2 lines */
PmeGpu* pmeGpu = pme->gpu;
- pmeGpu->common->ngrids = pme->ngrids;
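+    /* A second grid is only needed when the Coulomb interactions are perturbed */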
+ pmeGpu->common->ngrids = pme->bFEP_q ? 2 : 1;
pmeGpu->common->epsilon_r = pme->epsilon_r;
pmeGpu->common->ewaldcoeff_q = pme->ewaldcoeff_q;
pmeGpu->common->nk[XX] = pme->nkx;
pmeGpu->initializedClfftLibrary_ = std::make_unique<gmx::ClfftInitializer>();
pme_gpu_init_internal(pmeGpu, deviceContext, deviceStream);
- pme_gpu_alloc_energy_virial(pmeGpu);
pme_gpu_copy_common_data_from(pme);
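+    /* The energy/virial staging buffers are now allocated per grid, so the common data
+     * (which sets ngrids) has to be copied before the allocation. */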
+ pme_gpu_alloc_energy_virial(pmeGpu);
GMX_ASSERT(pmeGpu->common->epsilon_r != 0.0F, "PME GPU: bad electrostatic coefficient");
delete pmeGpu;
}
-void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* charges)
+void pme_gpu_reinit_atoms(PmeGpu* pmeGpu, const int nAtoms, const real* chargesA, const real* chargesB)
{
auto* kernelParamsPtr = pme_gpu_get_kernel_params_base_ptr(pmeGpu);
kernelParamsPtr->atoms.nAtoms = nAtoms;
GMX_RELEASE_ASSERT(false, "Only single precision supported");
-    GMX_UNUSED_VALUE(charges);
+    GMX_UNUSED_VALUE(chargesA);
+    GMX_UNUSED_VALUE(chargesB);
#else
- pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast<const float*>(charges));
+ int gridIndex = 0;
/* Could also be checked for haveToRealloc, but the copy always needs to be performed */
+ pme_gpu_realloc_and_copy_input_coefficients(pmeGpu, reinterpret_cast<const float*>(chargesA), gridIndex);
+ gridIndex++;
+ if (chargesB != nullptr)
+ {
+ pme_gpu_realloc_and_copy_input_coefficients(
+ pmeGpu, reinterpret_cast<const float*>(chargesB), gridIndex);
+ }
+ else
+ {
+ /* Fill the second set of coefficients with chargesA as well to be able to avoid
+ * conditionals in the GPU kernels */
+ /* FIXME: This should be avoided by making a separate templated version of the
+ * relevant kernel(s) (probably only pme_gather_kernel). That would require a
+ * reduction of the current number of templated parameters of that kernel. */
+ pme_gpu_realloc_and_copy_input_coefficients(
+ pmeGpu, reinterpret_cast<const float*>(chargesA), gridIndex);
+ }
#endif
if (haveToRealloc)
return timingEvent;
}
-void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, int grid_index)
+void pme_gpu_3dfft(const PmeGpu* pmeGpu, gmx_fft_direction dir, const int grid_index)
{
int timerId = (dir == GMX_FFT_REAL_TO_COMPLEX) ? gtPME_FFT_R2C : gtPME_FFT_C2R;
* \param[in] pmeGpu The PME GPU structure.
* \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom
* \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory
+ * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed.
*
* \return Pointer to CUDA kernel
*/
-static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, bool writeSplinesToGlobal)
+static auto selectSplineAndSpreadKernelPtr(const PmeGpu* pmeGpu,
+ ThreadsPerAtom threadsPerAtom,
+ bool writeSplinesToGlobal,
+ const int numGrids)
{
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
if (writeSplinesToGlobal)
{
if (threadsPerAtom == ThreadsPerAtom::Order)
{
- kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4Dual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesThPerAtom4Single;
+ }
}
else
{
- kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplines;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesDual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelWriteSplinesSingle;
+ }
}
}
else
{
if (threadsPerAtom == ThreadsPerAtom::Order)
{
- kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Dual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Single;
+ }
}
else
{
- kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelDual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelSingle;
+ }
}
}
* \param[in] pmeGpu The PME GPU structure.
* \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom
* \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory
+ * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed.
*
* \return Pointer to CUDA kernel
*/
static auto selectSplineKernelPtr(const PmeGpu* pmeGpu,
ThreadsPerAtom threadsPerAtom,
- bool gmx_unused writeSplinesToGlobal)
+ bool gmx_unused writeSplinesToGlobal,
+ const int numGrids)
{
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
GMX_ASSERT(
if (threadsPerAtom == ThreadsPerAtom::Order)
{
- kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4Dual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineKernelThPerAtom4Single;
+ }
}
else
{
- kernelPtr = pmeGpu->programHandle_->impl_->splineKernel;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineKernelDual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineKernelSingle;
+ }
}
return kernelPtr;
}
* \param[in] pmeGpu The PME GPU structure.
* \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom
* \param[in] writeSplinesToGlobal bool controlling if we should write spline data to global memory
+ * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed.
*
* \return Pointer to CUDA kernel
*/
-static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, bool writeSplinesToGlobal)
+static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu,
+ ThreadsPerAtom threadsPerAtom,
+ bool writeSplinesToGlobal,
+ const int numGrids)
{
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
if (writeSplinesToGlobal)
{
if (threadsPerAtom == ThreadsPerAtom::Order)
{
- kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Dual;
+ }
+            else
+            {
+ kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelThPerAtom4Single;
+ }
}
else
{
- kernelPtr = pmeGpu->programHandle_->impl_->spreadKernel;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelDual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->spreadKernelSingle;
+ }
}
}
else
using the spline and spread Kernel */
if (threadsPerAtom == ThreadsPerAtom::Order)
{
- kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Dual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelThPerAtom4Single;
+ }
}
else
{
- kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernel;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelDual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->splineAndSpreadKernelSingle;
+ }
}
}
return kernelPtr;
void pme_gpu_spread(const PmeGpu* pmeGpu,
GpuEventSynchronizer* xReadyOnDevice,
- int gmx_unused gridIndex,
- real* h_grid,
- bool computeSplines,
- bool spreadCharges)
+ real** h_grids,
+ bool computeSplines,
+ bool spreadCharges,
+ const real lambda)
{
+ GMX_ASSERT(
+ pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
+            "Only one (normal Coulomb PME) or two (FEP Coulomb PME) PME grids can be used on GPU");
+
GMX_ASSERT(computeSplines || spreadCharges,
"PME spline/spread kernel has invalid input (nothing to do)");
- const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
+ auto* kernelParamsPtr = pmeGpu->kernelParams.get();
GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread");
const size_t blockSize = pmeGpu->programHandle_->impl_->spreadWorkGroupSize;
const int blockCount = pmeGpu->nAtomsAlloc / atomsPerBlock;
auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
+ if (pmeGpu->common->ngrids == 1)
+ {
+ kernelParamsPtr->current.scale = 1.0;
+ }
+ else
+ {
+ kernelParamsPtr->current.scale = 1.0 - lambda;
+ }
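+    /* scale is the weight of the state-A contribution in the linear lambda interpolation;
+     * with a single grid no interpolation is needed. */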
+
KernelLaunchConfig config;
config.blockSize[0] = order;
config.blockSize[1] = (pmeGpu->settings.threadsPerAtom == ThreadsPerAtom::Order ? 1 : order);
{
timingId = gtPME_SPLINEANDSPREAD;
kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines));
+ writeGlobal || (!recalculateSplines),
+ pmeGpu->common->ngrids);
}
else
{
timingId = gtPME_SPLINE;
kernelPtr = selectSplineKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines));
+ writeGlobal || (!recalculateSplines),
+ pmeGpu->common->ngrids);
}
}
else
{
timingId = gtPME_SPREAD;
kernelPtr = selectSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines));
+ writeGlobal || (!recalculateSplines), pmeGpu->common->ngrids);
}
const auto kernelArgs = prepareGpuKernelArguments(
kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_theta,
&kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
- &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->grid.d_fractShiftsTable,
- &kernelParamsPtr->grid.d_gridlineIndicesTable, &kernelParamsPtr->atoms.d_coefficients,
- &kernelParamsPtr->atoms.d_coordinates);
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable, &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], &kernelParamsPtr->atoms.d_coordinates);
#endif
launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent,
const bool copyBackGrid = spreadCharges && (!settings.performGPUFFT || settings.copyAllOutputs);
if (copyBackGrid)
{
- pme_gpu_copy_output_spread_grid(pmeGpu, h_grid);
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ float* h_grid = h_grids[gridIndex];
+ pme_gpu_copy_output_spread_grid(pmeGpu, h_grid, gridIndex);
+ }
}
const bool copyBackAtomData =
computeSplines && (!settings.performGPUGather || settings.copyAllOutputs);
}
}
-void pme_gpu_solve(const PmeGpu* pmeGpu, t_complex* h_grid, GridOrdering gridOrdering, bool computeEnergyAndVirial)
+void pme_gpu_solve(const PmeGpu* pmeGpu,
+ const int gridIndex,
+ t_complex* h_grid,
+ GridOrdering gridOrdering,
+ bool computeEnergyAndVirial)
{
+ GMX_ASSERT(
+ pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
+            "Only one (normal Coulomb PME) or two (FEP Coulomb PME) PME grids can be used on GPU");
+ GMX_ASSERT(gridIndex < pmeGpu->common->ngrids,
+ "Invalid combination of gridIndex and number of grids");
+
const auto& settings = pmeGpu->settings;
const bool copyInputAndOutputGrid = !settings.performGPUFFT || settings.copyAllOutputs;
float* h_gridFloat = reinterpret_cast<float*>(h_grid);
if (copyInputAndOutputGrid)
{
- copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid, h_gridFloat, 0,
- pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_,
- pmeGpu->settings.transferKind, nullptr);
+ copyToDeviceBuffer(&kernelParamsPtr->grid.d_fourierGrid[gridIndex], h_gridFloat, 0,
+ pmeGpu->archSpecific->complexGridSize[gridIndex],
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
}
int majorDim = -1, middleDim = -1, minorDim = -1;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
if (gridOrdering == GridOrdering::YZX)
{
- kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernel
- : pmeGpu->programHandle_->impl_->solveYZXKernel;
+ if (gridIndex == 0)
+ {
+ kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernelA
+ : pmeGpu->programHandle_->impl_->solveYZXKernelA;
+ }
+ else
+ {
+ kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveYZXEnergyKernelB
+ : pmeGpu->programHandle_->impl_->solveYZXKernelB;
+ }
}
else if (gridOrdering == GridOrdering::XYZ)
{
- kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernel
- : pmeGpu->programHandle_->impl_->solveXYZKernel;
+ if (gridIndex == 0)
+ {
+ kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernelA
+ : pmeGpu->programHandle_->impl_->solveXYZKernelA;
+ }
+ else
+ {
+ kernelPtr = computeEnergyAndVirial ? pmeGpu->programHandle_->impl_->solveXYZEnergyKernelB
+ : pmeGpu->programHandle_->impl_->solveXYZKernelB;
+ }
}
pme_gpu_start_timing(pmeGpu, timingId);
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
const auto kernelArgs = prepareGpuKernelArguments(
- kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli,
- &kernelParamsPtr->constants.d_virialAndEnergy, &kernelParamsPtr->grid.d_fourierGrid);
+ kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->grid.d_splineModuli[gridIndex],
+ &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex],
+ &kernelParamsPtr->grid.d_fourierGrid[gridIndex]);
#endif
launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME solve",
kernelArgs);
if (computeEnergyAndVirial)
{
- copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy,
- &kernelParamsPtr->constants.d_virialAndEnergy, 0, c_virialAndEnergyCount,
- pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
+ copyFromDeviceBuffer(pmeGpu->staging.h_virialAndEnergy[gridIndex],
+ &kernelParamsPtr->constants.d_virialAndEnergy[gridIndex], 0,
+ c_virialAndEnergyCount, pmeGpu->archSpecific->pmeStream_,
+ pmeGpu->settings.transferKind, nullptr);
}
if (copyInputAndOutputGrid)
{
- copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid, 0,
- pmeGpu->archSpecific->complexGridSize, pmeGpu->archSpecific->pmeStream_,
- pmeGpu->settings.transferKind, nullptr);
+ copyFromDeviceBuffer(h_gridFloat, &kernelParamsPtr->grid.d_fourierGrid[gridIndex], 0,
+ pmeGpu->archSpecific->complexGridSize[gridIndex],
+ pmeGpu->archSpecific->pmeStream_, pmeGpu->settings.transferKind, nullptr);
}
}
* \param[in] pmeGpu The PME GPU structure.
* \param[in] threadsPerAtom Controls whether we should use order or order*order threads per atom
* \param[in] readSplinesFromGlobal bool controlling if we should write spline data to global memory
+ * \param[in] numGrids Number of grids to use. numGrids == 2 if Coulomb is perturbed.
*
* \return Pointer to CUDA kernel
*/
-inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu, ThreadsPerAtom threadsPerAtom, bool readSplinesFromGlobal)
+inline auto selectGatherKernelPtr(const PmeGpu* pmeGpu,
+ ThreadsPerAtom threadsPerAtom,
+ bool readSplinesFromGlobal,
+ const int numGrids)
{
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
{
if (threadsPerAtom == ThreadsPerAtom::Order)
{
- kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4Dual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesThPerAtom4Single;
+ }
}
else
{
- kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplines;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesDual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelReadSplinesSingle;
+ }
}
}
else
{
if (threadsPerAtom == ThreadsPerAtom::Order)
{
- kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4Dual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelThPerAtom4Single;
+ }
}
else
{
- kernelPtr = pmeGpu->programHandle_->impl_->gatherKernel;
+ if (numGrids == 2)
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelDual;
+ }
+ else
+ {
+ kernelPtr = pmeGpu->programHandle_->impl_->gatherKernelSingle;
+ }
}
}
return kernelPtr;
}
-
-void pme_gpu_gather(PmeGpu* pmeGpu, const float* h_grid)
+void pme_gpu_gather(PmeGpu* pmeGpu, real** h_grids, const float lambda)
{
+ GMX_ASSERT(
+ pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
+            "Only one (normal Coulomb PME) or two (FEP Coulomb PME) PME grids can be used on GPU");
+
const auto& settings = pmeGpu->settings;
+
if (!settings.performGPUFFT || settings.copyAllOutputs)
{
- pme_gpu_copy_input_gather_grid(pmeGpu, const_cast<float*>(h_grid));
+ for (int gridIndex = 0; gridIndex < pmeGpu->common->ngrids; gridIndex++)
+ {
+ float* h_grid = const_cast<float*>(h_grids[gridIndex]);
+ pme_gpu_copy_input_gather_grid(pmeGpu, h_grid, gridIndex);
+ }
}
if (settings.copyAllOutputs)
// TODO test different cache configs
- int timingId = gtPME_GATHER;
- PmeGpuProgramImpl::PmeKernelHandle kernelPtr = selectGatherKernelPtr(
- pmeGpu, pmeGpu->settings.threadsPerAtom, readGlobal || (!recalculateSplines));
+ int timingId = gtPME_GATHER;
+ PmeGpuProgramImpl::PmeKernelHandle kernelPtr =
+ selectGatherKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom,
+ readGlobal || (!recalculateSplines), pmeGpu->common->ngrids);
// TODO design kernel selection getters and make PmeGpu a friend of PmeGpuProgramImpl
pme_gpu_start_timing(pmeGpu, timingId);
- auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
- const auto* kernelParamsPtr = pmeGpu->kernelParams.get();
+ auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+ auto* kernelParamsPtr = pmeGpu->kernelParams.get();
+ if (pmeGpu->common->ngrids == 1)
+ {
+ kernelParamsPtr->current.scale = 1.0;
+ }
+ else
+ {
+ kernelParamsPtr->current.scale = 1.0 - lambda;
+ }
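+    /* As in the spread, scale = 1 - lambda is the state-A weight used when the dual-grid
+     * gather combines the contributions of the two states. */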
+
#if c_canEmbedBuffers
const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
const auto kernelArgs = prepareGpuKernelArguments(
- kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients,
- &kernelParamsPtr->grid.d_realGrid, &kernelParamsPtr->atoms.d_theta,
- &kernelParamsPtr->atoms.d_dtheta, &kernelParamsPtr->atoms.d_gridlineIndices,
- &kernelParamsPtr->atoms.d_forces);
+ kernelPtr, config, kernelParamsPtr, &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_theta, &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices, &kernelParamsPtr->atoms.d_forces);
#endif
launchGpuKernel(kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME gather",
kernelArgs);
struct PmeGpuSettings;
struct t_complex;
+#ifndef FEP_STATE_A
+//! Grid index of FEP state A (or unperturbed system)
+# define FEP_STATE_A 0
+#endif
+#ifndef FEP_STATE_B
+//! Grid index of FEP state B
+# define FEP_STATE_B 1
+#endif
+
namespace gmx
{
template<typename>
* Reallocates and copies the pre-computed B-spline values to the GPU.
*
* \param[in,out] pmeGpu The PME GPU structure.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*/
-void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu);
+void pme_gpu_realloc_and_copy_bspline_values(PmeGpu* pmeGpu, int gridIndex = 0);
/*! \libinternal \brief
* Frees the pre-computed B-spline values on the GPU (and the transfer CPU buffers).
*
* \param[in] pmeGpu The PME GPU structure.
* \param[in] h_coefficients The input atom charges/coefficients.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*
* Does not need to be done for every PME computation, only whenever the local charges change.
* (So, in the beginning of the run, or on DD step).
*/
-void pme_gpu_realloc_and_copy_input_coefficients(PmeGpu* pmeGpu, const float* h_coefficients);
+void pme_gpu_realloc_and_copy_input_coefficients(const PmeGpu* pmeGpu,
+ const float* h_coefficients,
+ int gridIndex = 0);
/*! \libinternal \brief
* Frees the charges/coefficients on the GPU.
/*! \libinternal \brief
* Copies the input real-space grid from the host to the GPU.
*
- * \param[in] pmeGpu The PME GPU structure.
- * \param[in] h_grid The host-side grid buffer.
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] h_grid The host-side grid buffer.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*/
-void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, float* h_grid);
+void pme_gpu_copy_input_gather_grid(const PmeGpu* pmeGpu, const float* h_grid, int gridIndex = 0);
/*! \libinternal \brief
* Copies the output real-space grid from the GPU to the host.
*
- * \param[in] pmeGpu The PME GPU structure.
- * \param[out] h_grid The host-side grid buffer.
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[out] h_grid The host-side grid buffer.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*/
-void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid);
+void pme_gpu_copy_output_spread_grid(const PmeGpu* pmeGpu, float* h_grid, int gridIndex = 0);
/*! \libinternal \brief
* Copies the spread output spline data and gridline indices from the GPU to the host.
*
- * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] pmeGpu The PME GPU structure.
*/
void pme_gpu_copy_output_spread_atom_data(const PmeGpu* pmeGpu);
/*! \libinternal \brief
* Copies the gather input spline data and gridline indices from the host to the GPU.
*
- * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] pmeGpu The PME GPU structure.
*/
void pme_gpu_copy_input_gather_atom_data(const PmeGpu* pmeGpu);
/*! \libinternal \brief
* A GPU spline computation and charge spreading function.
*
- * \param[in] pmeGpu The PME GPU structure.
- * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory;
- * can be nullptr when invoked on a separate PME rank or from PME tests.
- * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
- * \param[out] h_grid The host-side grid buffer (used only if the result of the spread is expected on the host,
- * e.g. testing or host-side FFT)
- * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
- * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory;
+ * can be nullptr when invoked on a separate PME rank or from PME tests.
+ * \param[out] h_grids The host-side grid buffers (used only if the result of the spread is expected on the host,
+ * e.g. testing or host-side FFT)
+ * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
+ * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
+ * \param[in] lambda The lambda value of the current system state.
*/
GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
- int GPU_FUNC_ARGUMENT(gridIndex),
- real* GPU_FUNC_ARGUMENT(h_grid),
+ float** GPU_FUNC_ARGUMENT(h_grids),
bool GPU_FUNC_ARGUMENT(computeSplines),
- bool GPU_FUNC_ARGUMENT(spreadCharges)) GPU_FUNC_TERM;
+ bool GPU_FUNC_ARGUMENT(spreadCharges),
+ const real GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM;
/*! \libinternal \brief
* 3D FFT R2C/C2R routine.
*
* \param[in] pmeGpu The PME GPU structure.
* \param[in] direction Transform direction (real-to-complex or complex-to-real)
- * \param[in] gridIndex Index of the PME grid - unused, assumed to be 0.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
*/
-void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex);
+void pme_gpu_3dfft(const PmeGpu* pmeGpu, enum gmx_fft_direction direction, int gridIndex = 0);
/*! \libinternal \brief
* A GPU Fourier space solving function.
*
* \param[in] pmeGpu The PME GPU structure.
+ * \param[in] gridIndex The index of the grid to use. 0 is Coulomb in the normal
+ * state or FEP state A and 1 is Coulomb in FEP state B.
* \param[in,out] h_grid The host-side input and output Fourier grid buffer (used only with testing or host-side FFT)
 * \param[in]     gridOrdering      Specifies the dimension ordering of the complex grid. TODO: store this information?
* \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed.
*/
GPU_FUNC_QUALIFIER void pme_gpu_solve(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
+ int GPU_FUNC_ARGUMENT(gridIndex),
t_complex* GPU_FUNC_ARGUMENT(h_grid),
GridOrdering GPU_FUNC_ARGUMENT(gridOrdering),
bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial)) GPU_FUNC_TERM;
/*! \libinternal \brief
* A GPU force gathering function.
*
- * \param[in] pmeGpu The PME GPU structure.
- * reductions. \param[in] h_grid The host-side grid buffer (used only in testing mode)
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] h_grids          The host-side grid buffers (used only in testing mode).
+ * \param[in] lambda The lambda value to use.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
- const float* GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
+ float** GPU_FUNC_ARGUMENT(h_grids),
+ float GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM;
+
/*! \brief Sets the device pointer to coordinate data
* \param[in] pmeGpu The PME GPU structure.
* handled the solve stage.
*
* \param[in] pme The PME structure.
+ * \param[in] lambda The lambda value to use when calculating the results.
* \param[out] output Pointer to output where energy and virial should be stored.
*/
GPU_FUNC_QUALIFIER void pme_gpu_getEnergyAndVirial(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
+ float GPU_FUNC_ARGUMENT(lambda),
PmeOutput* GPU_FUNC_ARGUMENT(output)) GPU_FUNC_TERM;
/*! \libinternal \brief
*
* \param[in] pme The PME structure.
* \param[in] computeEnergyAndVirial Whether the energy and virial are being computed
+ * \param[in] lambdaQ The Coulomb lambda to use when finalizing the output.
* \returns The output object.
*/
GPU_FUNC_QUALIFIER PmeOutput pme_gpu_getOutput(const gmx_pme_t& GPU_FUNC_ARGUMENT(pme),
- bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial))
+ bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial),
+ real GPU_FUNC_ARGUMENT(lambdaQ))
GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});
/*! \libinternal \brief
*
* \param[in] pmeGpu The PME GPU structure.
* \param[in] nAtoms The number of particles.
- * \param[in] charges The pointer to the host-side array of particle charges.
+ * \param[in] chargesA The pointer to the host-side array of particle charges in the unperturbed state or FEP state A.
+ * \param[in] chargesB The pointer to the host-side array of particle charges in FEP state B.
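+ *                            Can be nullptr when the charges are not perturbed.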
*
* This is a function that should only be called in the beginning of the run and on domain
* decomposition. Should be called before the pme_gpu_set_io_ranges.
*/
GPU_FUNC_QUALIFIER void pme_gpu_reinit_atoms(PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
int GPU_FUNC_ARGUMENT(nAtoms),
- const real* GPU_FUNC_ARGUMENT(charges)) GPU_FUNC_TERM;
+ const real* GPU_FUNC_ARGUMENT(chargesA),
+ const real* GPU_FUNC_ARGUMENT(chargesB) = nullptr) GPU_FUNC_TERM;
/*! \brief \libinternal
* The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
*
* \param[in] pme The PME data structure.
* \param[in] computeEnergyAndVirial Tells if the energy and virial computation should be performed.
+ * \param[in] lambdaQ The Coulomb lambda to use when calculating the results.
* \param[out] wcycle The wallclock counter.
* \return The output forces, energy and virial
*/
GPU_FUNC_QUALIFIER PmeOutput pme_gpu_wait_finish_task(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
bool GPU_FUNC_ARGUMENT(computeEnergyAndVirial),
+ real GPU_FUNC_ARGUMENT(lambdaQ),
gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle))
GPU_FUNC_TERM_WITH_RETURN(PmeOutput{});
constexpr bool c_wrapX = true;
constexpr bool c_wrapY = true;
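+//! Grid index of FEP state A (or the unperturbed system)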
+constexpr int c_stateA = 0;
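+//! Grid index of FEP state B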
+constexpr int c_stateB = 1;
+
//! PME CUDA kernels forward declarations. Kernels are documented in their respective files.
-template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int mode, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// clang-format off
extern template void
-pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
extern template void
-pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
extern template void
-pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
extern template void
-pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
extern template void
-pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
extern template void
-pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
extern template void
-pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
extern template void
-pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-// clang-format on
+pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+extern template void
+pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
+extern template void
+pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+extern template void
+pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
+extern template void
+pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+extern template void
+pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
+extern template void
+pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+extern template void
+pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::Order>(const PmeGpuCudaKernelParams);
+extern template void
+pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-template<GridOrdering gridOrdering, bool computeEnergyAndVirial>
+template<GridOrdering gridOrdering, bool computeEnergyAndVirial, const int gridIndex> /* It is significantly slower to pass gridIndex as a kernel parameter */
void pme_solve_kernel(const PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// clang-format off
-extern template void pme_solve_kernel<GridOrdering::XYZ, false>(const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::XYZ, true> (const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::YZX, false>(const PmeGpuCudaKernelParams);
-extern template void pme_solve_kernel<GridOrdering::YZX, true> (const PmeGpuCudaKernelParams);
+extern template void pme_solve_kernel<GridOrdering::XYZ, false, c_stateA>(const PmeGpuCudaKernelParams);
+extern template void pme_solve_kernel<GridOrdering::XYZ, true, c_stateA>(const PmeGpuCudaKernelParams);
+extern template void pme_solve_kernel<GridOrdering::YZX, false, c_stateA>(const PmeGpuCudaKernelParams);
+extern template void pme_solve_kernel<GridOrdering::YZX, true, c_stateA>(const PmeGpuCudaKernelParams);
+extern template void pme_solve_kernel<GridOrdering::XYZ, false, c_stateB>(const PmeGpuCudaKernelParams);
+extern template void pme_solve_kernel<GridOrdering::XYZ, true, c_stateB>(const PmeGpuCudaKernelParams);
+extern template void pme_solve_kernel<GridOrdering::YZX, false, c_stateB>(const PmeGpuCudaKernelParams);
+extern template void pme_solve_kernel<GridOrdering::YZX, true, c_stateB>(const PmeGpuCudaKernelParams);
// clang-format on
-template<int order, bool wrapX, bool wrapY, bool readGlobal, ThreadsPerAtom threadsPerAtom>
+template<int order, bool wrapX, bool wrapY, int nGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom>
void pme_gather_kernel(const PmeGpuCudaKernelParams kernelParams);
// Add extern declarations to inform that there will be a definition
// provided in another translation unit.
// clang-format off
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
+extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
+extern template void pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
// clang-format on
PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
* Similarly the spread kernel (without the spline) implies that we should read the spline data from global memory
*/
// clang-format off
- splineAndSpreadKernel = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, ThreadsPerAtom::OrderSquared>;
- splineAndSpreadKernelThPerAtom4 = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, false, ThreadsPerAtom::Order>;
- splineAndSpreadKernelWriteSplines = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, ThreadsPerAtom::OrderSquared>;
- splineAndSpreadKernelWriteSplinesThPerAtom4 = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, true, ThreadsPerAtom::Order>;
- splineKernel = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, ThreadsPerAtom::OrderSquared>;
- splineKernelThPerAtom4 = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, true, ThreadsPerAtom::Order>;
- spreadKernel = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, ThreadsPerAtom::OrderSquared>;
- spreadKernelThPerAtom4 = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, true, ThreadsPerAtom::Order>;
- gatherKernel = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, false, ThreadsPerAtom::OrderSquared>;
- gatherKernelThPerAtom4 = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, false, ThreadsPerAtom::Order>;
- gatherKernelReadSplines = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, true, ThreadsPerAtom::OrderSquared>;
- gatherKernelReadSplinesThPerAtom4 = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, true, ThreadsPerAtom::Order>;
- solveXYZKernel = pme_solve_kernel<GridOrdering::XYZ, false>;
- solveXYZEnergyKernel = pme_solve_kernel<GridOrdering::XYZ, true>;
- solveYZXKernel = pme_solve_kernel<GridOrdering::YZX, false>;
- solveYZXEnergyKernel = pme_solve_kernel<GridOrdering::YZX, true>;
+ splineAndSpreadKernelSingle = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::OrderSquared>;
+ splineAndSpreadKernelThPerAtom4Single = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::Order>;
+ splineAndSpreadKernelWriteSplinesSingle = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>;
+ splineAndSpreadKernelWriteSplinesThPerAtom4Single = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>;
+ splineKernelSingle = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>;
+ splineKernelThPerAtom4Single = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>;
+ spreadKernelSingle = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>;
+ spreadKernelThPerAtom4Single = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>;
+ splineAndSpreadKernelDual = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared>;
+ splineAndSpreadKernelThPerAtom4Dual = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::Order>;
+ splineAndSpreadKernelWriteSplinesDual = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>;
+ splineAndSpreadKernelWriteSplinesThPerAtom4Dual = pme_spline_and_spread_kernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>;
+ splineKernelDual = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>;
+ splineKernelThPerAtom4Dual = pme_spline_and_spread_kernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>;
+ spreadKernelDual = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>;
+ spreadKernelThPerAtom4Dual = pme_spline_and_spread_kernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>;
+ gatherKernelSingle = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::OrderSquared>;
+ gatherKernelThPerAtom4Single = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::Order>;
+ gatherKernelReadSplinesSingle = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared>;
+ gatherKernelReadSplinesThPerAtom4Single = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order>;
+ gatherKernelDual = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared>;
+ gatherKernelThPerAtom4Dual = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::Order>;
+ gatherKernelReadSplinesDual = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared>;
+ gatherKernelReadSplinesThPerAtom4Dual = pme_gather_kernel<c_pmeOrder, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order>;
+ solveXYZKernelA = pme_solve_kernel<GridOrdering::XYZ, false, c_stateA>;
+ solveXYZEnergyKernelA = pme_solve_kernel<GridOrdering::XYZ, true, c_stateA>;
+ solveYZXKernelA = pme_solve_kernel<GridOrdering::YZX, false, c_stateA>;
+ solveYZXEnergyKernelA = pme_solve_kernel<GridOrdering::YZX, true, c_stateA>;
+ solveXYZKernelB = pme_solve_kernel<GridOrdering::XYZ, false, c_stateB>;
+ solveXYZEnergyKernelB = pme_solve_kernel<GridOrdering::XYZ, true, c_stateB>;
+ solveYZXKernelB = pme_solve_kernel<GridOrdering::YZX, false, c_stateB>;
+ solveYZXEnergyKernelB = pme_solve_kernel<GridOrdering::YZX, true, c_stateB>;
// clang-format on
}
* or recalculated in the gather.
* Spreading kernels also have hardcoded X/Y indices wrapping parameters,
* as a placeholder for implementing 1/2D decomposition.
+ * The kernels are templated separately for spreading on one grid (one or
+ * two sets of coefficients) or on two grids (required for energy and virial
+ * calculations).
*/
size_t spreadWorkGroupSize;
- PmeKernelHandle splineKernel;
- PmeKernelHandle splineKernelThPerAtom4;
- PmeKernelHandle spreadKernel;
- PmeKernelHandle spreadKernelThPerAtom4;
- PmeKernelHandle splineAndSpreadKernel;
- PmeKernelHandle splineAndSpreadKernelThPerAtom4;
- PmeKernelHandle splineAndSpreadKernelWriteSplines;
- PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4;
+ PmeKernelHandle splineKernelSingle;
+ PmeKernelHandle splineKernelThPerAtom4Single;
+ PmeKernelHandle spreadKernelSingle;
+ PmeKernelHandle spreadKernelThPerAtom4Single;
+ PmeKernelHandle splineAndSpreadKernelSingle;
+ PmeKernelHandle splineAndSpreadKernelThPerAtom4Single;
+ PmeKernelHandle splineAndSpreadKernelWriteSplinesSingle;
+ PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4Single;
+ PmeKernelHandle splineKernelDual;
+ PmeKernelHandle splineKernelThPerAtom4Dual;
+ PmeKernelHandle spreadKernelDual;
+ PmeKernelHandle spreadKernelThPerAtom4Dual;
+ PmeKernelHandle splineAndSpreadKernelDual;
+ PmeKernelHandle splineAndSpreadKernelThPerAtom4Dual;
+ PmeKernelHandle splineAndSpreadKernelWriteSplinesDual;
+ PmeKernelHandle splineAndSpreadKernelWriteSplinesThPerAtom4Dual;
//@}
//@{
* it can either reduce with previous forces in the host buffer, or ignore them.
* Also similarly to the gather we can use either order(4) or order*order (16) threads per atom
* and either recalculate the splines or read the ones written by the spread
+ * The kernels are templated separately for using one or two grids (required for
+ * calculating energies and virial).
*/
size_t gatherWorkGroupSize;
- PmeKernelHandle gatherKernel;
- PmeKernelHandle gatherKernelThPerAtom4;
- PmeKernelHandle gatherKernelReadSplines;
- PmeKernelHandle gatherKernelReadSplinesThPerAtom4;
+ PmeKernelHandle gatherKernelSingle;
+ PmeKernelHandle gatherKernelThPerAtom4Single;
+ PmeKernelHandle gatherKernelReadSplinesSingle;
+ PmeKernelHandle gatherKernelReadSplinesThPerAtom4Single;
+ PmeKernelHandle gatherKernelDual;
+ PmeKernelHandle gatherKernelThPerAtom4Dual;
+ PmeKernelHandle gatherKernelReadSplinesDual;
+ PmeKernelHandle gatherKernelReadSplinesThPerAtom4Dual;
//@}
//@{
/** Solve kernel doesn't care about the interpolation order, but can optionally
* compute energy and virial, and supports XYZ and YZX grid orderings.
+ * The kernels are templated separately for grids in state A and B.
*/
size_t solveMaxWorkGroupSize;
- PmeKernelHandle solveYZXKernel;
- PmeKernelHandle solveXYZKernel;
- PmeKernelHandle solveYZXEnergyKernel;
- PmeKernelHandle solveXYZEnergyKernel;
+ PmeKernelHandle solveYZXKernelA;
+ PmeKernelHandle solveXYZKernelA;
+ PmeKernelHandle solveYZXEnergyKernelA;
+ PmeKernelHandle solveXYZEnergyKernelA;
+ PmeKernelHandle solveYZXKernelB;
+ PmeKernelHandle solveXYZKernelB;
+ PmeKernelHandle solveYZXEnergyKernelB;
+ PmeKernelHandle solveXYZEnergyKernelB;
//@}
PmeGpuProgramImpl() = delete;
{
// TODO: log releasing errors
cl_int gmx_used_in_debug stat = 0;
- stat |= clReleaseKernel(splineAndSpreadKernel);
- stat |= clReleaseKernel(splineKernel);
- stat |= clReleaseKernel(spreadKernel);
- stat |= clReleaseKernel(gatherKernel);
- stat |= clReleaseKernel(solveXYZKernel);
- stat |= clReleaseKernel(solveXYZEnergyKernel);
- stat |= clReleaseKernel(solveYZXKernel);
- stat |= clReleaseKernel(solveYZXEnergyKernel);
+ stat |= clReleaseKernel(splineAndSpreadKernelSingle);
+ stat |= clReleaseKernel(splineKernelSingle);
+ stat |= clReleaseKernel(spreadKernelSingle);
+ stat |= clReleaseKernel(splineAndSpreadKernelDual);
+ stat |= clReleaseKernel(splineKernelDual);
+ stat |= clReleaseKernel(spreadKernelDual);
+ stat |= clReleaseKernel(gatherKernelSingle);
+ stat |= clReleaseKernel(gatherKernelDual);
+ stat |= clReleaseKernel(solveXYZKernelA);
+ stat |= clReleaseKernel(solveXYZEnergyKernelA);
+ stat |= clReleaseKernel(solveYZXKernelA);
+ stat |= clReleaseKernel(solveYZXEnergyKernelA);
+ stat |= clReleaseKernel(solveXYZKernelB);
+ stat |= clReleaseKernel(solveXYZEnergyKernelB);
+ stat |= clReleaseKernel(solveYZXKernelB);
+ stat |= clReleaseKernel(solveYZXEnergyKernelB);
GMX_ASSERT(stat == CL_SUCCESS,
gmx::formatString("Failed to release PME OpenCL resources %d: %s", stat,
ocl_get_error_string(stat).c_str())
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
- constexpr cl_uint expectedKernelCount = 9;
+ constexpr cl_uint expectedKernelCount = 18;
// Has to be equal or larger than the number of kernel instances.
// If it is not, CL_INVALID_VALUE will be thrown.
std::vector<cl_kernel> kernels(expectedKernelCount, nullptr);
// The names below must correspond to those defined in pme_program.cl
// TODO use a map with string key instead?
- if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernel"))
+ if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernelSingle"))
{
- splineKernel = kernel;
+ splineKernelSingle = kernel;
}
- else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernelSingle"))
{
- splineAndSpreadKernel = kernel;
- splineAndSpreadKernelWriteSplines = kernel;
- checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
+ splineAndSpreadKernelSingle = kernel;
+ splineAndSpreadKernelWriteSplinesSingle = kernel;
+ checkRequiredWarpSize(splineAndSpreadKernelSingle, kernelNamesBuffer.data(), deviceInfo);
}
- else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernelSingle"))
{
- spreadKernel = kernel;
- checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
+ spreadKernelSingle = kernel;
+ checkRequiredWarpSize(spreadKernelSingle, kernelNamesBuffer.data(), deviceInfo);
}
- else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineKernelDual"))
{
- gatherKernel = kernel;
- gatherKernelReadSplines = kernel;
- checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
+ splineKernelDual = kernel;
}
- else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernelDual"))
{
- solveYZXKernel = kernel;
+ splineAndSpreadKernelDual = kernel;
+ splineAndSpreadKernelWriteSplinesDual = kernel;
+ checkRequiredWarpSize(splineAndSpreadKernelDual, kernelNamesBuffer.data(), deviceInfo);
}
- else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXEnergyKernel"))
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernelDual"))
{
- solveYZXEnergyKernel = kernel;
+ spreadKernelDual = kernel;
+ checkRequiredWarpSize(spreadKernelDual, kernelNamesBuffer.data(), deviceInfo);
}
- else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZKernel"))
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernelSingle"))
{
- solveXYZKernel = kernel;
+ gatherKernelSingle = kernel;
+ gatherKernelReadSplinesSingle = kernel;
+ checkRequiredWarpSize(gatherKernelSingle, kernelNamesBuffer.data(), deviceInfo);
}
- else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZEnergyKernel"))
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernelDual"))
{
- solveXYZEnergyKernel = kernel;
+ gatherKernelDual = kernel;
+ gatherKernelReadSplinesDual = kernel;
+ checkRequiredWarpSize(gatherKernelDual, kernelNamesBuffer.data(), deviceInfo);
+ }
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernelA"))
+ {
+ solveYZXKernelA = kernel;
+ }
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXEnergyKernelA"))
+ {
+ solveYZXEnergyKernelA = kernel;
+ }
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZKernelA"))
+ {
+ solveXYZKernelA = kernel;
+ }
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZEnergyKernelA"))
+ {
+ solveXYZEnergyKernelA = kernel;
+ }
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernelB"))
+ {
+ solveYZXKernelB = kernel;
+ }
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXEnergyKernelB"))
+ {
+ solveYZXEnergyKernelB = kernel;
+ }
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZKernelB"))
+ {
+ solveXYZKernelB = kernel;
+ }
+ else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveXYZEnergyKernelB"))
+ {
+ solveXYZEnergyKernelB = kernel;
}
}
clReleaseProgram(program);
#include "gromacs/gpu_utils/hostallocator.h"
#include "gromacs/math/vectypes.h"
+#ifndef NUMFEPSTATES
+//! Number of FEP states.
+# define NUMFEPSTATES 2
+#endif
+
/*! \internal \brief
* The PME GPU intermediate buffers structure, included in the main PME GPU structure by value.
* Buffers are managed by the PME GPU module.
gmx::PaddedHostVector<gmx::RVec> h_forces;
/*! \brief Virial and energy intermediate host-side buffer. Size is PME_GPU_VIRIAL_AND_ENERGY_COUNT. */
- float* h_virialAndEnergy;
+    float* h_virialAndEnergy[NUMFEPSTATES];
/*! \brief B-spline values intermediate host-side buffer. */
- float* h_splineModuli;
+    float* h_splineModuli[NUMFEPSTATES];
/*! \brief Pointer to the host memory with B-spline values. Only used for host-side gather, or unit tests */
float* h_theta;
# define HIDE_FROM_OPENCL_COMPILER(x) char8
#endif
+#ifndef NUMFEPSTATES
+//! Number of FEP states.
+# define NUMFEPSTATES 2
+#endif
+
/* What follows is all the PME GPU function arguments,
* sorted into several device-side structures depending on the update rate.
* This is GPU agnostic (float3 replaced by float[3], etc.).
float elFactor;
/*! \brief Virial and energy GPU array. Size is c_virialAndEnergyCount (7) floats.
* The element order is virxx, viryy, virzz, virxy, virxz, viryz, energy. */
- HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_virialAndEnergy;
+ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_virialAndEnergy[NUMFEPSTATES];
};
/*! \internal \brief
/* Grid arrays */
/*! \brief Real space grid. */
- HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_realGrid;
+ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_realGrid[NUMFEPSTATES];
/*! \brief Complex grid - used in FFT/solve. If inplace cu/clFFT is used, then it is the same handle as realGrid. */
- HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_fourierGrid;
+ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_fourierGrid[NUMFEPSTATES];
/*! \brief Grid spline values as in pme->bsp_mod
* (laid out sequentially (XXX....XYYY......YZZZ.....Z))
*/
- HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_splineModuli;
+ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_splineModuli[NUMFEPSTATES];
/*! \brief Fractional shifts lookup table as in pme->fshx/fshy/fshz, laid out sequentially (XXX....XYYY......YZZZ.....Z) */
HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_fractShiftsTable;
/*! \brief Gridline indices lookup table
* but reallocation happens only at DD.
*/
HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<gmx::RVec>) d_coordinates;
- /*! \brief Global GPU memory array handle with input atom charges.
+ /*! \brief Global GPU memory array handle with input atom charges in states A and B.
* The charges only need to be reallocated and copied to the GPU at DD step.
*/
- HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_coefficients;
+ HIDE_FROM_OPENCL_COMPILER(DeviceBuffer<float>) d_coefficients[NUMFEPSTATES];
/*! \brief Global GPU memory array handle with input/output rvec atom forces.
* The forces change and need to be copied from (and possibly to) the GPU for every PME
* computation, but reallocation happens only at DD.
float recipBox[DIM][DIM];
/*! \brief The unit cell volume for solving. */
float boxVolume;
+
+ /*! \brief The current coefficient scaling value. */
+ float scale;
};
/*! \internal \brief
*/
struct PmeShared
{
- /*! \brief Grid count - currently always 1 on GPU */
+ /*! \brief Grid count */
int ngrids;
/*! \brief Grid dimensions - nkx, nky, nkz */
int nk[DIM];
#include "pme_gpu_3dfft.h"
+#ifndef NUMFEPSTATES
+//! Number of FEP states.
+# define NUMFEPSTATES 2
+#endif
+
class GpuParallel3dFft;
/*! \internal \brief
/*! \brief Both the kernelParams.atoms.theta and kernelParams.atoms.dtheta float element count (reserved) */
int splineDataSizeAlloc = 0;
/*! \brief The kernelParams.atoms.coefficients float element count (actual) */
- int coefficientsSize = 0;
+ int coefficientsSize[NUMFEPSTATES] = { 0, 0 };
/*! \brief The kernelParams.atoms.coefficients float element count (reserved) */
- int coefficientsSizeAlloc = 0;
+ int coefficientsCapacity[NUMFEPSTATES] = { 0, 0 };
/*! \brief The kernelParams.grid.splineValuesArray float element count (actual) */
- int splineValuesSize = 0;
+ int splineValuesSize[NUMFEPSTATES] = { 0, 0 };
/*! \brief The kernelParams.grid.splineValuesArray float element count (reserved) */
- int splineValuesSizeAlloc = 0;
+ int splineValuesCapacity[NUMFEPSTATES] = { 0, 0 };
/*! \brief The kernelParams.grid.realGrid float element count (actual) */
- int realGridSize = 0;
+ int realGridSize[NUMFEPSTATES] = { 0, 0 };
/*! \brief The kernelParams.grid.realGrid float element count (reserved) */
- int realGridSizeAlloc = 0;
+ int realGridCapacity[NUMFEPSTATES] = { 0, 0 };
/*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (actual) */
- int complexGridSize = 0;
+ int complexGridSize[NUMFEPSTATES] = { 0, 0 };
/*! \brief The kernelParams.grid.fourierGrid float (not float2!) element count (reserved) */
- int complexGridSizeAlloc = 0;
+ int complexGridCapacity[NUMFEPSTATES] = { 0, 0 };
};
#endif
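As a rough sketch of how the per-state size/capacity pairs above are meant to be used, each device buffer would be (re)allocated once per FEP state; the call below follows the shape of the GROMACS reallocateDeviceBuffer utility, but the exact signature and the deviceContext argument are assumptions, not taken from this patch.

    // Sketch only: per-FEP-state reallocation bookkeeping (signature assumed).
    for (int gridIndex = 0; gridIndex < NUMFEPSTATES; gridIndex++)
    {
        reallocateDeviceBuffer(&kernelParams->grid.d_realGrid[gridIndex], newRealGridSize,
                               &realGridSize[gridIndex], &realGridCapacity[gridIndex], deviceContext);
    }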
{
if (atomSetChanged)
{
- gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA.data());
+ gmx_pme_reinit_atoms(pme, nat, pme_pp->chargeA.data(), pme_pp->chargeB.data());
if (useGpuForPme)
{
stateGpu->reinit(nat, nat);
stepWork.computeVirial = computeEnergyAndVirial;
stepWork.computeEnergy = computeEnergyAndVirial;
stepWork.computeForces = true;
- PmeOutput output;
+ PmeOutput output = { {}, false, 0, { { 0 } }, 0, 0, { { 0 } }, 0 };
if (useGpuForPme)
{
stepWork.haveDynamicBox = false;
// TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157
auto xReadyOnDevice = nullptr;
- pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle);
+ pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle, lambda_q);
pme_gpu_launch_complex_transforms(pme, wcycle, stepWork);
- pme_gpu_launch_gather(pme, wcycle);
- output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, wcycle);
+ pme_gpu_launch_gather(pme, wcycle, lambda_q);
+ output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle);
pme_gpu_reinit_computation(pme, wcycle);
}
else
real coulombEnergy_ = 0;
//!< Host staging area for PME coulomb virial contributions
matrix coulombVirial_ = { { 0 } };
+ //!< Host staging area for PME coulomb dVdl.
+ real coulombDvdl_ = 0;
//!< Host staging area for PME LJ energy
real lennardJonesEnergy_ = 0;
//!< Host staging area for PME LJ virial contributions
matrix lennardJonesVirial_ = { { 0 } };
+ //!< Host staging area for PME LJ dVdl. (Not used)
+ real lennardJonesDvdl_ = 0;
};
#endif
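The Coulomb energy and dVdl staging fields are expected to be filled by combining the per-grid reductions linearly in lambda; a minimal sketch, assuming the two grid energies have already been reduced into illustrative host variables energyA and energyB:

    // Sketch only: linear coupling of the two Coulomb grids,
    // E(lambda) = (1 - lambda) * E_A + lambda * E_B, so dE/dlambda = E_B - E_A.
    output.coulombEnergy_ = (1.0F - lambdaQ) * energyA + lambdaQ * energyB;
    output.coulombDvdl_   = energyB - energyA;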
#define atomsPerBlock (c_spreadWorkGroupSize / threadsPerAtom)
#define atomsPerWarp (warp_size / threadsPerAtom)
-// spline/spread fused
+// spline/spread fused on a single grid (no FEP, or FEP without energy and virial calculation at this step)
#define computeSplines 1
#define spreadCharges 1
-#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineAndSpreadKernel
+#define numGrids 1 /* Using define to avoid a conditional inside the function. */
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineAndSpreadKernelSingle
#include "pme_spread.clh"
#undef computeSplines
#undef spreadCharges
+#undef numGrids
#undef CUSTOMIZED_KERNEL_NAME
-// spline
+// spline on a single grid (no FEP, or FEP without energy and virial calculation at this step)
#define computeSplines 1
#define spreadCharges 0
-#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineKernel
+#define numGrids 1 /* Using define to avoid a conditional inside the function. */
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineKernelSingle
#include "pme_spread.clh"
#undef computeSplines
#undef spreadCharges
+#undef numGrids
#undef CUSTOMIZED_KERNEL_NAME
-// spread
+// spread on a single grid (no FEP, or FEP without energy and virial calculation at this step)
#define computeSplines 0
#define spreadCharges 1
-#define CUSTOMIZED_KERNEL_NAME(x) pmeSpreadKernel
+#define numGrids 1 /* Using define to avoid a conditional inside the function. */
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSpreadKernelSingle
#include "pme_spread.clh"
#undef computeSplines
#undef spreadCharges
+#undef numGrids
+#undef CUSTOMIZED_KERNEL_NAME
+
+// spline/spread fused on two grids (FEP with energy and virial calculation at this step)
+#define computeSplines 1
+#define spreadCharges 1
+#define numGrids 2 /* Using define to avoid a conditional inside the function. */
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineAndSpreadKernelDual
+#include "pme_spread.clh"
+#undef computeSplines
+#undef spreadCharges
+#undef numGrids
+#undef CUSTOMIZED_KERNEL_NAME
+
+// spline on two grids (FEP with energy and virial calculation at this step)
+#define computeSplines 1
+#define spreadCharges 0
+#define numGrids 2 /* Using define to avoid a conditional inside the function. */
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSplineKernelDual
+#include "pme_spread.clh"
+#undef computeSplines
+#undef spreadCharges
+#undef numGrids
+#undef CUSTOMIZED_KERNEL_NAME
+
+// spread on two grids (FEP with energy and virial calculation at this step)
+#define computeSplines 0
+#define spreadCharges 1
+#define numGrids 2 /* Using define to avoid a conditional inside the function. */
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSpreadKernelDual
+#include "pme_spread.clh"
+#undef computeSplines
+#undef spreadCharges
+#undef numGrids
#undef CUSTOMIZED_KERNEL_NAME
#define atomsPerBlock (c_gatherWorkGroupSize / threadsPerAtom)
-// gather
-#define CUSTOMIZED_KERNEL_NAME(x) pmeGatherKernel
+// gather using single grid
+#define numGrids 1 /* Using define to avoid a conditional inside the function. */
+#define CUSTOMIZED_KERNEL_NAME(x) pmeGatherKernelSingle
#include "pme_gather.clh"
+#undef numGrids
#undef CUSTOMIZED_KERNEL_NAME
+// gather using two grids
+#define numGrids 2 /* Using define to avoid a conditional inside the function. */
+#define CUSTOMIZED_KERNEL_NAME(x) pmeGatherKernelDual
+#include "pme_gather.clh"
+#undef numGrids
+#undef CUSTOMIZED_KERNEL_NAME
#undef atomsPerBlock
#define YZX 1
#define XYZ 2
-// solve, YZX dimension order
+// solve, YZX dimension order state A
+#define gridOrdering YZX
+#define computeEnergyAndVirial 0
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXKernelA
+#include "pme_solve.clh"
+#undef gridOrdering
+#undef computeEnergyAndVirial
+#undef CUSTOMIZED_KERNEL_NAME
+
+// solve with reduction, YZX dimension order state A
+#define gridOrdering YZX
+#define computeEnergyAndVirial 1
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXEnergyKernelA
+#include "pme_solve.clh"
+#undef gridOrdering
+#undef computeEnergyAndVirial
+#undef CUSTOMIZED_KERNEL_NAME
+
+// solve, XYZ dimension order state A
+#define gridOrdering XYZ
+#define computeEnergyAndVirial 0
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZKernelA
+#include "pme_solve.clh"
+#undef gridOrdering
+#undef computeEnergyAndVirial
+#undef CUSTOMIZED_KERNEL_NAME
+
+// solve with reduction, XYZ dimension order state A
+#define gridOrdering XYZ
+#define computeEnergyAndVirial 1
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZEnergyKernelA
+#include "pme_solve.clh"
+#undef gridOrdering
+#undef computeEnergyAndVirial
+#undef CUSTOMIZED_KERNEL_NAME
+
+// solve, YZX dimension order state B
#define gridOrdering YZX
#define computeEnergyAndVirial 0
-#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXKernel
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXKernelB
#include "pme_solve.clh"
#undef gridOrdering
#undef computeEnergyAndVirial
#undef CUSTOMIZED_KERNEL_NAME
-// solve with reduction, YZX dimension order
+// solve with reduction, YZX dimension order state B
#define gridOrdering YZX
#define computeEnergyAndVirial 1
-#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXEnergyKernel
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveYZXEnergyKernelB
#include "pme_solve.clh"
#undef gridOrdering
#undef computeEnergyAndVirial
#undef CUSTOMIZED_KERNEL_NAME
-// solve, XYZ dimension order
+// solve, XYZ dimension order state B
#define gridOrdering XYZ
#define computeEnergyAndVirial 0
-#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZKernel
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZKernelB
#include "pme_solve.clh"
#undef gridOrdering
#undef computeEnergyAndVirial
#undef CUSTOMIZED_KERNEL_NAME
-// solve with reduction, XYZ dimension order
+// solve with reduction, XYZ dimension order state B
#define gridOrdering XYZ
#define computeEnergyAndVirial 1
-#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZEnergyKernel
+#define CUSTOMIZED_KERNEL_NAME(x) pmeSolveXYZEnergyKernelB
#include "pme_solve.clh"
#undef gridOrdering
#undef computeEnergyAndVirial
* \param[in] kernelParams Input PME GPU data in constant memory.
* \param[in] gm_splineModuli B-Spline moduli.
 * \param[out] gm_virialAndEnergy Reduced virial and energy (only with computeEnergyAndVirial ==
- * true) \param[in,out] gm_grid Fourier grid to transform.
+ * true)
+ * \param[in,out] gm_grid Fourier grid to transform.
*/
__attribute__((work_group_size_hint(c_solveMaxWorkGroupSize, 1, 1)))
__kernel void CUSTOMIZED_KERNEL_NAME(pme_solve_kernel)(const struct PmeOpenCLKernelParams kernelParams,
& (gridLineIndex < gridLinesPerBlock))
{
/* The offset should be equal to the global thread index for coalesced access */
- const int gridIndex = (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
- __global float2* __restrict__ gm_gridCell = gm_grid + gridIndex;
+ const int gridThreadIndex =
+ (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
+ __global float2* __restrict__ gm_gridCell = gm_grid + gridThreadIndex;
const int kMajor = indexMajor + localOffsetMajor;
/* Checking either X in XYZ, or Y in YZX cases */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* PME complex grid solver kernel function.
*
* \tparam[in] gridOrdering Specifies the dimension ordering of the complex grid.
- * \tparam[in] computeEnergyAndVirial Tells if the reciprocal energy and virial should be
- * computed. \param[in] kernelParams Input PME CUDA data in constant memory.
+ * \tparam[in] computeEnergyAndVirial Tells if the reciprocal energy and virial should be computed.
+ * \tparam[in] gridIndex The index of the grid to use in the kernel.
+ * \param[in] kernelParams Input PME CUDA data in constant memory.
*/
-template<GridOrdering gridOrdering, bool computeEnergyAndVirial>
+template<GridOrdering gridOrdering, bool computeEnergyAndVirial, const int gridIndex>
__launch_bounds__(c_solveMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE __global__
void pme_solve_kernel(const struct PmeGpuCudaKernelParams kernelParams)
{
}
/* Global memory pointers */
- const float* __restrict__ gm_splineValueMajor =
- kernelParams.grid.d_splineModuli + kernelParams.grid.splineValuesOffset[majorDim];
- const float* __restrict__ gm_splineValueMiddle =
- kernelParams.grid.d_splineModuli + kernelParams.grid.splineValuesOffset[middleDim];
- const float* __restrict__ gm_splineValueMinor =
- kernelParams.grid.d_splineModuli + kernelParams.grid.splineValuesOffset[minorDim];
- float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy;
- float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid;
+ const float* __restrict__ gm_splineValueMajor = kernelParams.grid.d_splineModuli[gridIndex]
+ + kernelParams.grid.splineValuesOffset[majorDim];
+ const float* __restrict__ gm_splineValueMiddle = kernelParams.grid.d_splineModuli[gridIndex]
+ + kernelParams.grid.splineValuesOffset[middleDim];
+ const float* __restrict__ gm_splineValueMinor = kernelParams.grid.d_splineModuli[gridIndex]
+ + kernelParams.grid.splineValuesOffset[minorDim];
+ float* __restrict__ gm_virialAndEnergy = kernelParams.constants.d_virialAndEnergy[gridIndex];
+ float2* __restrict__ gm_grid = (float2*)kernelParams.grid.d_fourierGrid[gridIndex];
/* Various grid sizes and indices */
const int localOffsetMinor = 0, localOffsetMajor = 0, localOffsetMiddle = 0; // unused
& (gridLineIndex < gridLinesPerBlock))
{
/* The offset should be equal to the global thread index for coalesced access */
- const int gridIndex = (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
- float2* __restrict__ gm_gridCell = gm_grid + gridIndex;
+ const int gridThreadIndex =
+ (indexMajor * localSizeMiddle + indexMiddle) * localSizeMinor + indexMinor;
+ float2* __restrict__ gm_gridCell = gm_grid + gridThreadIndex;
const int kMajor = indexMajor + localOffsetMajor;
/* Checking either X in XYZ, or Y in YZX cases */
}
//! Kernel instantiations
-template __global__ void pme_solve_kernel<GridOrdering::YZX, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_solve_kernel<GridOrdering::YZX, false>(const PmeGpuCudaKernelParams);
-template __global__ void pme_solve_kernel<GridOrdering::XYZ, true>(const PmeGpuCudaKernelParams);
-template __global__ void pme_solve_kernel<GridOrdering::XYZ, false>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::YZX, true, 0>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::YZX, false, 0>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, true, 0>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, false, 0>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::YZX, true, 1>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::YZX, false, 1>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, true, 1>(const PmeGpuCudaKernelParams);
+template __global__ void pme_solve_kernel<GridOrdering::XYZ, false, 1>(const PmeGpuCudaKernelParams);
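Since the grid index is now a template parameter of the CUDA solve kernel, the runtime index of the grid being solved has to be mapped onto one of the instantiations above before launch; a minimal sketch of that dispatch (the helper is illustrative, not the patch's actual launcher):

    // Illustrative only: runtime-to-compile-time dispatch on the grid index.
    template<GridOrdering gridOrdering, bool computeEnergyAndVirial>
    auto selectSolveKernel(const int gridIndex)
    {
        return (gridIndex == 0) ? pme_solve_kernel<gridOrdering, computeEnergyAndVirial, 0>
                                : pme_solve_kernel<gridOrdering, computeEnergyAndVirial, 1>;
    }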
*
* \param[out] sm_destination Local memory array for output.
* \param[in] gm_source Global memory array for input.
- * \param[in] dataCountPerAtom Number of data elements per single atom (e.g. DIM for an rvec coordinates array).
+ * \param[in] dataCountPerAtom Number of data elements per single
+ * atom (e.g. DIM for an rvec coordinates array).
*
*/
inline void pme_gpu_stage_atom_data(__local float* __restrict__ sm_destination,
* \param[out] gm_dtheta Atom spline parameter derivatives
* \param[out] gm_gridlineIndices Same as \p sm_gridlineIndices but in global memory.
* \param[in] gm_fractShiftsTable Atom fractional coordinates correction table
- * \param[in] gm_gridlineIndicesTable Atom fractional coordinates correction table
+ * \param[in] gm_gridlineIndicesTable Table with atom gridline indices
*/
gmx_opencl_inline void calculate_splines(const struct PmeOpenCLKernelParams kernelParams,
const int atomIndexOffset,
* \param[in,out] gm_theta Atom spline parameter values
* \param[out] gm_dtheta Atom spline parameter derivatives
* \param[in,out] gm_gridlineIndices Atom gridline indices (ivec)
- * \param[out] gm_grid Global 3D grid for charge spreading.
+ * \param[out] gm_gridA Global 3D grid for charge spreading.
+ * Grid for the unperturbed state, for FEP state A, or the single grid used
+ * for interpolated coefficients when FEP A/B runs on one grid.
+ * \param[out] gm_gridB Global 3D grid for charge spreading.
+ * FEP state B when using dual grids (when calculating energy and virials).
* \param[in] gm_fractShiftsTable Atom fractional coordinates correction table
- * \param[in] gm_gridlineIndicesTable Atom fractional coordinates correction table
- * \param[in] gm_coefficients Atom charges/coefficients.
+ * \param[in] gm_gridlineIndicesTable Table with atom gridline indices
+ * \param[in] gm_coefficientsA Atom charges/coefficients of the unperturbed state
+ * or FEP state A.
+ * \param[in] gm_coefficientsB Atom charges/coefficients in FEP state B. Only
+ * used when spreading interpolated coefficients on one grid.
* \param[in] gm_coordinates Atom coordinates (rvec)
*/
__attribute__((reqd_work_group_size(order, order, atomsPerBlock))) __kernel void CUSTOMIZED_KERNEL_NAME(
__global float* __restrict__ gm_theta,
__global float* __restrict__ gm_dtheta,
__global int* __restrict__ gm_gridlineIndices,
- __global float* __restrict__ gm_grid,
+ __global float* __restrict__ gm_gridA,
+ __global float* __restrict__ gm_gridB,
__global const float* __restrict__ gm_fractShiftsTable,
__global const int* __restrict__ gm_gridlineIndicesTable,
- __global const float* __restrict__ gm_coefficients,
+ __global const float* __restrict__ gm_coefficientsA,
+ __global const float* __restrict__ gm_coefficientsB,
__global const float* __restrict__ gm_coordinates)
{
// Gridline indices, ivec
__local float sm_coordinates[DIM * atomsPerBlock];
const int atomIndexOffset = (int)get_group_id(XX) * atomsPerBlock;
+ int gridIndex = 0;
/* Staging coefficients/charges for both spline and spread */
- pme_gpu_stage_atom_data(sm_coefficients, gm_coefficients, 1);
+ pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsA, 1);
if (computeSplines)
{
/* Spreading */
if (spreadCharges)
{
- spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_grid);
+ spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_gridA);
+ }
+ if (numGrids == 2)
+ {
+ barrier(CLK_LOCAL_MEM_FENCE);
+ gridIndex = 1;
+ pme_gpu_stage_atom_data(sm_coefficients, gm_coefficientsB, 1);
+ barrier(CLK_LOCAL_MEM_FENCE);
+ if (spreadCharges)
+ {
+ spread_charges(kernelParams, sm_coefficients, sm_gridlineIndices, sm_theta, gm_gridB);
+ }
}
}
#include "pme_gpu_calculate_splines.cuh"
#include "pme_grid.h"
+/*
+ * This define affects the spline calculation behaviour in the kernel.
+ * 0: a single GPU thread handles a single dimension of a single particle (calculating and
+ *    storing (order) spline values and derivatives).
+ * 1: (order) threads do redundant work on this same task, each one stores only a single theta
+ *    and a single dtheta into global arrays.
+ * The only efficiency difference is fewer global store operations, countered by more redundant
+ * spline computation.
+ *
+ * TODO: estimate if this should be a boolean parameter (and add it to the unit test if so).
+ */
+#define PME_GPU_PARALLEL_SPLINE 0
+
/*! \brief
* Charge spreading onto the grid.
* This corresponds to the CPU function spread_coefficients_bsplines_thread().
* \tparam[in] order PME interpolation order.
* \tparam[in] wrapX Whether the grid overlap in dimension X should be wrapped.
* \tparam[in] wrapY Whether the grid overlap in dimension Y should be wrapped.
+ * \tparam[in] gridIndex The index of the grid to use in the kernel.
* \tparam[in] threadsPerAtom How many threads work on each atom
*
* \param[in] kernelParams Input PME CUDA data in constant memory.
* \param[in] sm_gridlineIndices Atom gridline indices in the shared memory.
* \param[in] sm_theta Atom spline values in the shared memory.
*/
-template<int order, bool wrapX, bool wrapY, ThreadsPerAtom threadsPerAtom>
+template<int order, bool wrapX, bool wrapY, int gridIndex, ThreadsPerAtom threadsPerAtom>
__device__ __forceinline__ void spread_charges(const PmeGpuCudaKernelParams kernelParams,
const float* atomCharge,
const int* __restrict__ sm_gridlineIndices,
const float* __restrict__ sm_theta)
{
/* Global memory pointer to the output grid */
- float* __restrict__ gm_grid = kernelParams.grid.d_realGrid;
+ float* __restrict__ gm_grid = kernelParams.grid.d_realGrid[gridIndex];
// Number of atoms processed by a single warp in spread and gather
const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
* \tparam[in] spreadCharges A boolean which tells if the charge spreading should be performed.
* \tparam[in] wrapX A boolean which tells if the grid overlap in dimension X should be wrapped.
* \tparam[in] wrapY A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam[in] numGrids The number of grids to use in the kernel. Can be 1 or 2.
* \tparam[in] writeGlobal A boolean which tells if the theta values and gridlines should be written to global memory.
* \tparam[in] threadsPerAtom How many threads work on each atom
* \param[in] kernelParams Input PME CUDA data in constant memory.
*/
-template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom>
__launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBUTE __global__
void pme_spline_and_spread_kernel(const PmeGpuCudaKernelParams kernelParams)
{
const int atomsPerWarp = warp_size / threadsPerAtomValue;
// Gridline indices, ivec
__shared__ int sm_gridlineIndices[atomsPerBlock * DIM];
+ // Charges
+ __shared__ float sm_coefficients[atomsPerBlock];
// Spline values
__shared__ float sm_theta[atomsPerBlock * DIM * order];
float dtheta;
/* Charges, required for both spline and spread */
if (c_useAtomDataPrefetch)
{
- __shared__ float sm_coefficients[atomsPerBlock];
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients, kernelParams.atoms.d_coefficients);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients,
+ kernelParams.atoms.d_coefficients[0]);
__syncthreads();
atomCharge = sm_coefficients[atomIndexLocal];
}
else
{
- atomCharge = kernelParams.atoms.d_coefficients[atomIndexGlobal];
+ atomCharge = kernelParams.atoms.d_coefficients[0][atomIndexGlobal];
}
if (computeSplines)
/* Spreading */
if (spreadCharges)
{
- spread_charges<order, wrapX, wrapY, threadsPerAtom>(kernelParams, &atomCharge,
- sm_gridlineIndices, sm_theta);
+ spread_charges<order, wrapX, wrapY, 0, threadsPerAtom>(kernelParams, &atomCharge,
+ sm_gridlineIndices, sm_theta);
+ }
+ if (numGrids == 2)
+ {
+ __syncthreads();
+ if (c_useAtomDataPrefetch)
+ {
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients,
+ kernelParams.atoms.d_coefficients[1]);
+ __syncthreads();
+ atomCharge = sm_coefficients[atomIndexLocal];
+ }
+ else
+ {
+ atomCharge = kernelParams.atoms.d_coefficients[1][atomIndexGlobal];
+ }
+ if (spreadCharges)
+ {
+ spread_charges<order, wrapX, wrapY, 1, threadsPerAtom>(kernelParams, &atomCharge,
+ sm_gridlineIndices, sm_theta);
+ }
}
}
//! Kernel instantiations
// clang-format off
-template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
-template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 1, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 1, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, ThreadsPerAtom::Order> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, true, ThreadsPerAtom::OrderSquared> (const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, false, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, false, true, true, true, 2, true, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
+template __global__ void pme_spline_and_spread_kernel<4, true, true, true, true, 2, false, ThreadsPerAtom::OrderSquared>(const PmeGpuCudaKernelParams);
// clang-format on
bool pmeSupportsInputForMode(const gmx_hw_info_t& hwinfo, const t_inputrec* inputRec, CodePath mode)
{
- bool implemented;
- gmx_mtop_t mtop;
+ bool implemented;
switch (mode)
{
case CodePath::CPU: implemented = true; break;
case CodePath::GPU:
implemented = (pme_gpu_supports_build(nullptr) && pme_gpu_supports_hardware(hwinfo, nullptr)
- && pme_gpu_supports_input(*inputRec, mtop, nullptr));
+ && pme_gpu_supports_input(*inputRec, nullptr));
break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
atc = &(pme->atc[0]);
atc->x = coordinates;
atc->coefficient = charges;
- gmx_pme_reinit_atoms(pme, atomCount, charges.data());
+ gmx_pme_reinit_atoms(pme, atomCount, charges.data(), nullptr);
/* With decomposition there would be more boilerplate atc code here, e.g. do_redist_pos_coeffs */
break;
atc = &(pme->atc[0]);
// We need to set atc->n for passing the size in the tests
atc->setNumAtoms(atomCount);
- gmx_pme_reinit_atoms(pme, atomCount, charges.data());
+ gmx_pme_reinit_atoms(pme, atomCount, charges.data(), nullptr);
stateGpu->reinit(atomCount, atomCount);
stateGpu->copyCoordinatesToGpu(arrayRefFromArray(coordinates.data(), coordinates.size()),
PmeAtomComm* atc = &(pme->atc[0]);
const size_t gridIndex = 0;
const bool computeSplinesForZeroCharges = true;
- real* fftgrid = spreadCharges ? pme->fftgrid[gridIndex] : nullptr;
+ real** fftgrid = spreadCharges ? pme->fftgrid : nullptr;
real* pmegrid = pme->pmegrid[gridIndex].grid.grid;
switch (mode)
{
case CodePath::CPU:
spread_on_grid(pme, atc, &pme->pmegrid[gridIndex], computeSplines, spreadCharges,
- fftgrid, computeSplinesForZeroCharges, gridIndex);
+ fftgrid != nullptr ? fftgrid[gridIndex] : nullptr,
+ computeSplinesForZeroCharges, gridIndex);
if (spreadCharges && !pme->bUseThreads)
{
wrap_periodic_pmegrid(pme, pmegrid);
- copy_pmegrid_to_fftgrid(pme, pmegrid, fftgrid, gridIndex);
+ copy_pmegrid_to_fftgrid(
+ pme, pmegrid, fftgrid != nullptr ? fftgrid[gridIndex] : nullptr, gridIndex);
}
break;
+/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
+ * double precision. GPUs are not used with double precision anyhow. */
+#if !GMX_DOUBLE
case CodePath::GPU:
{
+ const real lambdaQ = 1.0;
// no synchronization needed as x is transferred in the PME stream
GpuEventSynchronizer* xReadyOnDevice = nullptr;
- pme_gpu_spread(pme->gpu, xReadyOnDevice, gridIndex, fftgrid, computeSplines, spreadCharges);
+ pme_gpu_spread(pme->gpu, xReadyOnDevice, fftgrid, computeSplines, spreadCharges, lambdaQ);
}
break;
+#endif
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
t_complex* h_grid = pmeGetComplexGridInternal(pme);
const bool useLorentzBerthelot = false;
const size_t threadIndex = 0;
+ const size_t gridIndex = 0;
switch (mode)
{
case CodePath::CPU:
switch (method)
{
case PmeSolveAlgorithm::Coulomb:
- pme_gpu_solve(pme->gpu, h_grid, gridOrdering, computeEnergyAndVirial);
+ pme_gpu_solve(pme->gpu, gridIndex, h_grid, gridOrdering, computeEnergyAndVirial);
break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
const size_t threadIndex = 0;
const size_t gridIndex = 0;
real* pmegrid = pme->pmegrid[gridIndex].grid.grid;
- real* fftgrid = pme->fftgrid[gridIndex];
+ real** fftgrid = pme->fftgrid;
switch (mode)
{
// something which is normally done in serial spline computation (make_thread_local_ind())
atc->spline[threadIndex].n = atomCount;
}
- copy_fftgrid_to_pmegrid(pme, fftgrid, pmegrid, gridIndex, pme->nthread, threadIndex);
+ copy_fftgrid_to_pmegrid(pme, fftgrid[gridIndex], pmegrid, gridIndex, pme->nthread, threadIndex);
unwrap_periodic_pmegrid(pme, pmegrid);
gather_f_bsplines(pme, pmegrid, true, atc, &atc->spline[threadIndex], scale);
break;
+/* The compiler will complain about passing fftgrid (converting double ** to float **) if using
+ * double precision. GPUs are not used with double precision anyhow. */
+#if !GMX_DOUBLE
case CodePath::GPU:
{
// Variable initialization needs a non-switch scope
const bool computeEnergyAndVirial = false;
- PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial);
+ const real lambdaQ = 1.0;
+ PmeOutput output = pme_gpu_getOutput(*pme, computeEnergyAndVirial, lambdaQ);
GMX_ASSERT(forces.size() == output.forces_.size(),
"Size of force buffers did not match");
- pme_gpu_gather(pme->gpu, reinterpret_cast<float*>(fftgrid));
+ pme_gpu_gather(pme->gpu, fftgrid, lambdaQ);
std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces));
}
break;
+#endif
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
//! Getting the reciprocal energy and virial
PmeOutput pmeGetReciprocalEnergyAndVirial(const gmx_pme_t* pme, CodePath mode, PmeSolveAlgorithm method)
{
- PmeOutput output;
+ PmeOutput output;
+ const real lambdaQ = 1.0;
switch (mode)
{
case CodePath::CPU:
case CodePath::GPU:
switch (method)
{
- case PmeSolveAlgorithm::Coulomb: pme_gpu_getEnergyAndVirial(*pme, &output); break;
+ case PmeSolveAlgorithm::Coulomb:
+ pme_gpu_getEnergyAndVirial(*pme, lambdaQ, &output);
+ break;
default: GMX_THROW(InternalError("Test not implemented for this mode"));
}
gmx::AlignedAllocationPolicy::free(mdatoms_->invmass);
sfree(mdatoms_->invMassPerDim);
sfree(mdatoms_->typeA);
- sfree(mdatoms_->chargeB);
sfree(mdatoms_->typeB);
+ /* mdatoms->chargeA and mdatoms->chargeB point at chargeA_.data()
+ * and chargeB_.data() respectively. They get freed automatically. */
sfree(mdatoms_->sqrt_c6A);
sfree(mdatoms_->sigmaA);
sfree(mdatoms_->sigma3A);
sfree(mdatoms_->cU2);
}
-void MDAtoms::resize(int newSize)
+void MDAtoms::resizeChargeA(const int newSize)
{
chargeA_.resizeWithPadding(newSize);
mdatoms_->chargeA = chargeA_.data();
}
-void MDAtoms::reserve(int newCapacity)
+void MDAtoms::resizeChargeB(const int newSize)
+{
+ chargeB_.resizeWithPadding(newSize);
+ mdatoms_->chargeB = chargeB_.data();
+}
+
+void MDAtoms::reserveChargeA(const int newCapacity)
{
chargeA_.reserveWithPadding(newCapacity);
mdatoms_->chargeA = chargeA_.data();
}
+void MDAtoms::reserveChargeB(const int newCapacity)
+{
+ chargeB_.reserveWithPadding(newCapacity);
+ mdatoms_->chargeB = chargeB_.data();
+}
+
std::unique_ptr<MDAtoms> makeMDAtoms(FILE* fp, const gmx_mtop_t& mtop, const t_inputrec& ir, const bool rankHasPmeGpuTask)
{
auto mdAtoms = std::make_unique<MDAtoms>();
if (rankHasPmeGpuTask)
{
changePinningPolicy(&mdAtoms->chargeA_, pme_get_pinning_policy());
+ changePinningPolicy(&mdAtoms->chargeB_, pme_get_pinning_policy());
}
t_mdatoms* md;
snew(md, 1);
// everything, but for now the semantics of md->nalloc being
// the capacity are preserved by keeping vectors within
// mdAtoms having the same properties as the other arrays.
- mdAtoms->reserve(md->nalloc);
- mdAtoms->resize(md->nr);
+ mdAtoms->reserveChargeA(md->nalloc);
+ mdAtoms->resizeChargeA(md->nr);
+ if (md->nPerturbed > 0)
+ {
+ mdAtoms->reserveChargeB(md->nalloc);
+ mdAtoms->resizeChargeB(md->nr);
+ }
srenew(md->typeA, md->nalloc);
if (md->nPerturbed)
{
- srenew(md->chargeB, md->nalloc);
srenew(md->typeB, md->nalloc);
}
if (bLJPME)
unique_cptr<t_mdatoms> mdatoms_;
//! Memory for chargeA that can be set up for efficient GPU transfer.
gmx::PaddedHostVector<real> chargeA_;
+ //! Memory for chargeB that can be set up for efficient GPU transfer.
+ gmx::PaddedHostVector<real> chargeB_;
public:
// TODO make this private
t_mdatoms* mdatoms() { return mdatoms_.get(); }
//! Const getter.
const t_mdatoms* mdatoms() const { return mdatoms_.get(); }
- /*! \brief Resizes memory.
+ /*! \brief Resizes memory for charges of FEP state A.
*
* \throws std::bad_alloc If out of memory.
*/
- void resize(int newSize);
- /*! \brief Reserves memory.
+ void resizeChargeA(int newSize);
+ /*! \brief Resizes memory for charges of FEP state B.
*
* \throws std::bad_alloc If out of memory.
*/
- void reserve(int newCapacity);
+ void resizeChargeB(int newSize);
+ /*! \brief Reserves memory for charges of FEP state A.
+ *
+ * \throws std::bad_alloc If out of memory.
+ */
+ void reserveChargeA(int newCapacity);
+ /*! \brief Reserves memory for charges of FEP state B.
+ *
+ * \throws std::bad_alloc If out of memory.
+ */
+ void reserveChargeB(int newCapacity);
//! Builder function.
friend std::unique_ptr<MDAtoms>
makeMDAtoms(FILE* fp, const gmx_mtop_t& mtop, const t_inputrec& ir, bool rankHasPmeGpuTask);
* \param[in] pmedata The PME structure
* \param[in] box The box matrix
* \param[in] stepWork Step schedule flags
- * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in
- * the device memory. \param[in] wcycle The wallcycle structure
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory.
+ * \param[in] lambdaQ The Coulomb lambda of the current state.
+ * \param[in] wcycle The wallcycle structure
*/
static inline void launchPmeGpuSpread(gmx_pme_t* pmedata,
const matrix box,
const StepWorkload& stepWork,
GpuEventSynchronizer* xReadyOnDevice,
+ const real lambdaQ,
gmx_wallcycle_t wcycle)
{
pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
- pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle);
+ pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
}
/*! \brief Launch the FFT and gather stages of PME GPU
* This function only implements setting the output forces (no accumulation).
*
* \param[in] pmedata The PME structure
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
* \param[in] wcycle The wallcycle structure
* \param[in] stepWork Step schedule flags
*/
-static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata, gmx_wallcycle_t wcycle, const gmx::StepWorkload& stepWork)
+static void launchPmeGpuFftAndGather(gmx_pme_t* pmedata,
+ const real lambdaQ,
+ gmx_wallcycle_t wcycle,
+ const gmx::StepWorkload& stepWork)
{
pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
- pme_gpu_launch_gather(pmedata, wcycle);
+ pme_gpu_launch_gather(pmedata, wcycle, lambdaQ);
}
/*! \brief
* \param[in,out] pmedata PME module data
* \param[in,out] forceOutputs Output buffer for the forces and virial
* \param[in,out] enerd Energy data structure results are reduced into
+ * \param[in] lambdaQ The Coulomb lambda of the current system state.
* \param[in] stepWork Step schedule flags
* \param[in] wcycle The wallcycle structure
*/
gmx_pme_t* pmedata,
gmx::ForceOutputs* forceOutputs,
gmx_enerdata_t* enerd,
+ const real lambdaQ,
const StepWorkload& stepWork,
gmx_wallcycle_t wcycle)
{
GpuTaskCompletion completionType =
(isNbGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
isPmeGpuDone = pme_gpu_try_finish_task(pmedata, stepWork, wcycle, &forceWithVirial,
- enerd, completionType);
+ enerd, lambdaQ, completionType);
}
if (!isNbGpuDone)
if (useGpuPmeOnThisRank)
{
- launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, wcycle);
+ launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, lambda[efptCOUL], wcycle);
}
/* do gridding for pair search */
// X copy/transform to allow overlap as well as after the GPU NB
// launch to avoid FFT launch overhead hijacking the CPU and delaying
// the nonbonded kernel.
- launchPmeGpuFftAndGather(fr->pmedata, wcycle, stepWork);
+ launchPmeGpuFftAndGather(fr->pmedata, lambda[efptCOUL], wcycle, stepWork);
}
/* Communicate coordinates and sum dipole if necessary +
&& !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
if (alternateGpuWait)
{
- alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, stepWork, wcycle);
+ alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd, lambda[efptCOUL],
+ stepWork, wcycle);
}
if (!alternateGpuWait && useGpuPmeOnThisRank)
{
- pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd);
+ pme_gpu_wait_and_reduce(fr->pmedata, stepWork, wcycle, &forceOut.forceWithVirial(), enerd,
+ lambda[efptCOUL]);
}
/* Wait for local GPU NB outputs on the non-alternating wait path */
hw_opt.nthreads_tmpi);
useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(
useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
- *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
+ *inputrec, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded,
gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected);
useGpuForPme = decideWhetherToUseGpusForPme(
- useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec, mtop,
+ useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec,
cr->sizeOfDefaultCommunicator, domdecOptions.numPmeRanks, gpusWereDetected);
auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr)
&& inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
if (EEL_PME(fr->ic->eeltype))
{
- gmx_pme_reinit_atoms(fr->pmedata, a_tp0, nullptr);
+ gmx_pme_reinit_atoms(fr->pmedata, a_tp0, nullptr, nullptr);
}
/* With reacion-field we have distance dependent potentials
const std::vector<int>& userGpuTaskAssignment,
const gmx_hw_info_t& hardwareInfo,
const t_inputrec& inputrec,
- const gmx_mtop_t& mtop,
const int numRanksPerSimulation,
const int numPmeRanksPerSimulation)
{
// First, exclude all cases where we can't run PME on GPUs.
if ((pmeTarget == TaskTarget::Cpu) || !useGpuForNonbonded || !pme_gpu_supports_build(nullptr)
- || !pme_gpu_supports_hardware(hardwareInfo, nullptr)
- || !pme_gpu_supports_input(inputrec, mtop, nullptr))
+ || !pme_gpu_supports_hardware(hardwareInfo, nullptr) || !pme_gpu_supports_input(inputrec, nullptr))
{
// PME can't run on a GPU. If the user required that, we issue
// an error later.
const std::vector<int>& userGpuTaskAssignment,
const gmx_hw_info_t& hardwareInfo,
const t_inputrec& inputrec,
- const gmx_mtop_t& mtop,
const int numRanksPerSimulation,
const int numPmeRanksPerSimulation,
const bool gpusWereDetected)
}
return false;
}
- if (!pme_gpu_supports_input(inputrec, mtop, &message))
+ if (!pme_gpu_supports_input(inputrec, &message))
{
if (pmeTarget == TaskTarget::Gpu)
{
* \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs.
* \param[in] hardwareInfo Hardware information
* \param[in] inputrec The user input
- * \param[in] mtop Global system topology
* \param[in] numRanksPerSimulation The number of ranks in each simulation.
* \param[in] numPmeRanksPerSimulation The number of PME ranks in each simulation.
*
const std::vector<int>& userGpuTaskAssignment,
const gmx_hw_info_t& hardwareInfo,
const t_inputrec& inputrec,
- const gmx_mtop_t& mtop,
int numRanksPerSimulation,
int numPmeRanksPerSimulation);
* \param[in] userGpuTaskAssignment The user-specified assignment of GPU tasks to device IDs.
* \param[in] hardwareInfo Hardware information
* \param[in] inputrec The user input
- * \param[in] mtop Global system topology
* \param[in] numRanksPerSimulation The number of ranks in each simulation.
* \param[in] numPmeRanksPerSimulation The number of PME ranks in each simulation.
* \param[in] gpusWereDetected Whether compatible GPUs were detected on any node.
const std::vector<int>& userGpuTaskAssignment,
const gmx_hw_info_t& hardwareInfo,
const t_inputrec& inputrec,
- const gmx_mtop_t& mtop,
int numRanksPerSimulation,
int numPmeRanksPerSimulation,
bool gpusWereDetected);
GMX_RELEASE_ASSERT((EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype))
&& pme_gpu_supports_build(nullptr)
&& pme_gpu_supports_hardware(*hwinfo, nullptr)
- && pme_gpu_supports_input(*inputrec, *mtop, nullptr),
+ && pme_gpu_supports_input(*inputrec, nullptr),
"PME can't be on GPUs unless we are using PME");
// PME on GPUs supports a single PME rank with PP running on the same or few other ranks.