Enable with GMX_USE_GPU_BUFFER_OPS env variable.
Provides functionality to perform the reduction of PME forces in the
F buffer ops kernel. Currently active when a single GPU performs both
PME and PP tasks (multi-GPU support will follow in a patch which
performs PME/PP communication directly between GPUs). When active, the
Device->Host copy of PME forces and the CPU-side reduction are disabled.
Implements part of #3029, refs #2817
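
To try it out (assuming the usual GROMACS convention that merely
setting the environment variable enables the feature):

    GMX_USE_GPU_BUFFER_OPS=1 gmx mdrun -nb gpu -pme gpu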
Change-Id: I3e66b6919c1e86bf0bed42b74136f8694626910b
enum class GpuTaskCompletion;
class PmeGpuProgram;
+class GpuEventSynchronizer;
//! Convenience name.
using PmeGpuProgramHandle = const PmeGpuProgram *;
* \param[in] forceTreatment Tells how data should be treated. The gathering kernel either stores
* the output reciprocal forces into the host array, or first copies the host array's contents to
* the GPU and accumulates into it. The reduction is non-atomic.
+ * \param[in] useGpuFPmeReduction Whether PME forces are reduced on GPU
*/
GPU_FUNC_QUALIFIER void pme_gpu_launch_gather(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
- PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment)) GPU_FUNC_TERM;
+ PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment),
+ bool GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM;
/*! \brief
* Attempts to complete PME GPU tasks.
* \param[in] wcycle The wallclock counter.
* \param[out] forceWithVirial The output force and virial
* \param[out] enerd The output energies
+ * \param[in] useGpuFPmeReduction Whether PME forces are reduced on GPU
*/
GPU_FUNC_QUALIFIER void
pme_gpu_wait_and_reduce(gmx_pme_t *GPU_FUNC_ARGUMENT(pme),
int GPU_FUNC_ARGUMENT(flags),
gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle),
gmx::ForceWithVirial *GPU_FUNC_ARGUMENT(forceWithVirial),
- gmx_enerdata_t *GPU_FUNC_ARGUMENT(enerd)) GPU_FUNC_TERM;
+ gmx_enerdata_t *GPU_FUNC_ARGUMENT(enerd),
+ bool GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM;
/*! \brief
* The PME GPU reinitialization function that is called both at the end of any PME computation and on any load balancing.
gmx_wallcycle *GPU_FUNC_ARGUMENT(wcycle)) GPU_FUNC_TERM;
-/*! \brief Get pointer to device copy of coordinate data. */
+/*! \brief Get pointer to device copy of coordinate data.
+ * \param[in] pme The PME data structure.
+ * \returns Pointer to coordinate data
+ */
GPU_FUNC_QUALIFIER void *pme_gpu_get_device_x(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Get pointer to device copy of force data.
+ * \param[in] pme The PME data structure.
+ * \returns Pointer to force data
+ */
+GPU_FUNC_QUALIFIER void *pme_gpu_get_device_f(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Get pointer to the device synchronizer object that allows syncing on PME force calculation completion
+ * \param[in] pme The PME data structure.
+ * \returns Pointer to synchronizer
+ */
+GPU_FUNC_QUALIFIER GpuEventSynchronizer *pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *GPU_FUNC_ARGUMENT(pme)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
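+/* Sketch of the intended consumer pattern for the two getters above
+ * (cf. the do_force() changes below; names such as nbv, fr and the
+ * boolean flags are illustrative):
+ *
+ *   nbv->atomdata_add_nbat_f_to_f(locality, f,
+ *                                 pme_gpu_get_device_f(fr->pmedata),
+ *                                 pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+ *                                 useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+ */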
#endif
void pme_gpu_launch_gather(const gmx_pme_t *pme,
gmx_wallcycle gmx_unused *wcycle,
- PmeForceOutputHandling forceTreatment)
+ PmeForceOutputHandling forceTreatment,
+ bool useGpuFPmeReduction)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
const unsigned int gridIndex = 0;
real *fftgrid = pme->fftgrid[gridIndex];
- pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid));
+ pme_gpu_gather(pme->gpu, forceTreatment, reinterpret_cast<float *>(fftgrid), useGpuFPmeReduction);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
const PmeOutput &output,
gmx_wallcycle *wcycle,
gmx::ForceWithVirial *forceWithVirial,
- gmx_enerdata_t *enerd)
+ gmx_enerdata_t *enerd,
+ bool useGpuFPmeReduction)
{
wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
GMX_ASSERT(forceWithVirial, "Invalid force pointer");
forceWithVirial->addVirialContribution(output.coulombVirial_);
enerd->term[F_COUL_RECIP] += output.coulombEnergy_;
}
- sum_forces(forceWithVirial->force_, output.forces_);
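+ // If the PME forces were already reduced on the GPU, the CPU-side sum is not needed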
+ if (!useGpuFPmeReduction)
+ {
+ sum_forces(forceWithVirial->force_, output.forces_);
+ }
wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
}
PmeOutput output = pme_gpu_getOutput(*pme, flags);
wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
- pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd);
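+ // This path always reduces on the CPU; GPU-side PME force reduction is reached via pme_gpu_wait_and_reduce()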
+ pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd, false);
return true;
}
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
+
// Synchronize the whole PME stream at once, including D2H result transfers.
+ // TODO: make this sync conditional with useGpuFPmeReduction to wait only for virial/energies
pme_gpu_synchronize(pme->gpu);
pme_gpu_update_timings(pme->gpu);
const int flags,
gmx_wallcycle *wcycle,
gmx::ForceWithVirial *forceWithVirial,
- gmx_enerdata_t *enerd)
+ gmx_enerdata_t *enerd,
+ bool useGpuFPmeReduction)
{
PmeOutput output = pme_gpu_wait_finish_task(pme, flags, wcycle);
- pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd);
+ pme_gpu_reduce_outputs(flags, output, wcycle, forceWithVirial, enerd, useGpuFPmeReduction);
}
void pme_gpu_reinit_computation(const gmx_pme_t *pme,
}
return pme_gpu_get_kernelparam_coordinates(pme->gpu);
}
+
+void *pme_gpu_get_device_f(const gmx_pme_t *pme)
+{
+ if (!pme || !pme_gpu_active(pme))
+ {
+ return nullptr;
+ }
+ return pme_gpu_get_kernelparam_forces(pme->gpu);
+}
+
+GpuEventSynchronizer * pme_gpu_get_f_ready_synchronizer(const gmx_pme_t *pme)
+{
+ if (!pme || !pme_gpu_active(pme))
+ {
+ return nullptr;
+ }
+
+ return pme_gpu_get_forces_ready_synchronizer(pme->gpu);
+}
// FIXME: sync required since the copied data will be used by PP stream when using single GPU for both
// Remove after adding the required event-based sync between the above H2D and the transform kernel
pme_gpu_synchronize(pmeGpu);
-
#endif
}
void pme_gpu_gather(PmeGpu *pmeGpu,
PmeForceOutputHandling forceTreatment,
- const float *h_grid
+ const float *h_grid,
+ bool useGpuFPmeReduction
)
{
/* Copying the input CPU forces for reduction */
launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs);
pme_gpu_stop_timing(pmeGpu, timingId);
- pme_gpu_copy_output_forces(pmeGpu);
+ if (useGpuFPmeReduction)
+ {
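+ // Mark the event in the PME stream; the F buffer ops kernel enqueues a wait on it before reducing these forces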
+ pmeGpu->archSpecific->pmeForcesReady.markEvent(pmeGpu->archSpecific->pmeStream);
+ }
+ else
+ {
+ pme_gpu_copy_output_forces(pmeGpu);
+ }
}
void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *pmeGpu)
{
return nullptr;
}
+}
+void * pme_gpu_get_kernelparam_forces(const PmeGpu *pmeGpu)
+{
+ if (pmeGpu && pmeGpu->kernelParams)
+ {
+ return pmeGpu->kernelParams->atoms.d_forces;
+ }
+ else
+ {
+ return nullptr;
+ }
+}
+
+GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *pmeGpu)
+{
+ if (pmeGpu && pmeGpu->archSpecific)
+ {
+ return &pmeGpu->archSpecific->pmeForcesReady;
+ }
+ else
+ {
+ return nullptr;
+ }
}
* \param[in] forceTreatment Tells how data in h_forces should be treated.
* TODO: determine efficiency/balance of host/device-side reductions.
* \param[in] h_grid The host-side grid buffer (used only in testing mode)
+ * \param[in] useGpuFPmeReduction Whether forces are reduced on GPU
*/
GPU_FUNC_QUALIFIER void pme_gpu_gather(PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu),
PmeForceOutputHandling GPU_FUNC_ARGUMENT(forceTreatment),
- const float *GPU_FUNC_ARGUMENT(h_grid)) GPU_FUNC_TERM;
+ const float *GPU_FUNC_ARGUMENT(h_grid),
+ bool GPU_FUNC_ARGUMENT(useGpuFPmeReduction)) GPU_FUNC_TERM;
-/*! \brief Return pointer to device copy of coordinate data. */
+/*! \brief Return pointer to device copy of coordinate data.
+ * \param[in] pmeGpu The PME GPU structure.
+ * \returns Pointer to coordinate data
+ */
GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_coordinates(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+/*! \brief Return pointer to device copy of force data.
+ * \param[in] pmeGpu The PME GPU structure.
+ * \returns Pointer to force data
+ */
+GPU_FUNC_QUALIFIER void * pme_gpu_get_kernelparam_forces(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
+/*! \brief Return pointer to the sync object triggered after the PME force calculation completion
+ * \param[in] pmeGpu The PME GPU structure.
+ * \returns Pointer to sync object
+ */
+GPU_FUNC_QUALIFIER GpuEventSynchronizer *pme_gpu_get_forces_ready_synchronizer(const PmeGpu *GPU_FUNC_ARGUMENT(pmeGpu)) GPU_FUNC_TERM_WITH_RETURN(nullptr);
+
/* The inlined convenience PME GPU status getters */
/*! \libinternal \brief
Context context;
/* Synchronization events */
+ /*! \brief Triggered after the PME force calculations have been completed. */
+ GpuEventSynchronizer pmeForcesReady;
/*! \brief Triggered after the grid has been copied to the host (after the spreading stage). */
GpuEventSynchronizer syncSpreadGridD2H;
pme_gpu_prepare_computation(pme, boxChanged, box, wcycle, pmeFlags);
pme_gpu_launch_spread(pme, as_rvec_array(pme_pp->x.data()), wcycle);
pme_gpu_launch_complex_transforms(pme, wcycle);
- pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set);
+ pme_gpu_launch_gather(pme, wcycle, PmeForceOutputHandling::Set, false);
output = pme_gpu_wait_finish_task(pme, pmeFlags, wcycle);
pme_gpu_reinit_computation(pme, wcycle);
}
{
std::copy(std::begin(forces), std::end(forces), std::begin(output.forces_));
}
- pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid));
+ pme_gpu_gather(pme->gpu, inputTreatment, reinterpret_cast<float *>(fftgrid), false);
std::copy(std::begin(output.forces_), std::end(output.forces_), std::begin(forces));
}
break;
* \param[in] pmedata The PME structure
* \param[in] wcycle The wallcycle structure
+ * \param[in] useGpuFPmeReduction Whether forces will be reduced on GPU
*/
static void launchPmeGpuFftAndGather(gmx_pme_t *pmedata,
- gmx_wallcycle_t wcycle)
+ gmx_wallcycle_t wcycle,
+ bool useGpuFPmeReduction)
{
pme_gpu_launch_complex_transforms(pmedata, wcycle);
- pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set);
+ pme_gpu_launch_gather(pmedata, wcycle, PmeForceOutputHandling::Set, useGpuFPmeReduction);
}
/*! \brief
wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
- as_rvec_array(force->unpaddedArrayRef().data()),
- BufferOpsUseGpu::False,
- GpuBufferOpsAccumulateForce::Null);
+ as_rvec_array(force->unpaddedArrayRef().data()));
}
}
}
((flags & GMX_FORCE_VIRIAL) ? GMX_PME_CALC_ENER_VIR : 0) |
((flags & GMX_FORCE_ENERGY) ? GMX_PME_CALC_ENER_VIR : 0) |
((flags & GMX_FORCE_FORCES) ? GMX_PME_CALC_F : 0);
- const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
- && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ?
- BufferOpsUseGpu::True : BufferOpsUseGpu::False;
+ // Switches on whether to use GPU for position and force buffer operations
+ // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
const BufferOpsUseGpu useGpuXBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA)) ?
BufferOpsUseGpu::True : BufferOpsUseGpu::False;
+ // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
+ const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
+ && !(flags & (GMX_FORCE_VIRIAL | GMX_FORCE_ENERGY)) ?
+ BufferOpsUseGpu::True : BufferOpsUseGpu::False;
+ // TODO: move / add this flag to the internal PME GPU data structures
+ const bool useGpuFPmeReduction = (useGpuFBufOps == BufferOpsUseGpu::True) &&
+ thisRankHasDuty(cr, DUTY_PME) && useGpuPme; // only supported if this rank is performing PME on the GPU
/* At a search step we need to start the first balancing region
* somewhere early inside the step after communication during domain
// X copy/transform to allow overlap as well as after the GPU NB
// launch to avoid FFT launch overhead hijacking the CPU and delaying
// the nonbonded kernel.
- launchPmeGpuFftAndGather(fr->pmedata, wcycle);
+ launchPmeGpuFftAndGather(fr->pmedata, wcycle, useGpuFPmeReduction);
}
/* Communicate coordinates and sum dipole if necessary +
* communication with calculation with domain decomposition.
*/
wallcycle_stop(wcycle, ewcFORCE);
- nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f(),
- BufferOpsUseGpu::False,
- GpuBufferOpsAccumulateForce::Null);
+ nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::All, forceOut.f());
wallcycle_start_nocount(wcycle, ewcFORCE);
flags, &forceOut.forceWithVirial(), enerd,
ed, bNS);
- // flag to specify if CPU force output is preset in force
- // buffer. For now, this is true even when useGpuPme == true
- // (because on-GPU PME-nonbonded reduction will be added in
- // follow-up)
- // TODO adapt the below when on-GPU PME-nonbonded reduction is available.
- bool useCpuPmeReduction = true;
- bool haveCpuForces = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuPmeReduction);
- // flag to specify if forces should be accumulated in force buffer
- // ops. For now, this is solely determined by above haveCpuForces
- // flag, but in future developments it will also depend on
- // e.g. whether the GPU force halo exchange is active.
- GpuBufferOpsAccumulateForce accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) &&
- haveCpuForces ? GpuBufferOpsAccumulateForce::True :
- GpuBufferOpsAccumulateForce::False;
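+ // CPU force contributions are present when this rank has special forces,
+ // CPU listed forces, or reduces PME forces on the CPU; buffer ops must then
+ // accumulate into, rather than overwrite, the force buffer.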
+ bool useCpuFPmeReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuFPmeReduction;
+ bool haveCpuForces = (ppForceWorkload->haveSpecialForces || ppForceWorkload->haveCpuListedForceWork || useCpuFPmeReduction);
// Will store the amount of cycles spent waiting for the GPU that
// will be later used in the DLB accounting.
{
nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal);
}
+
+ // flag to specify if forces should be accumulated in force buffer
+ // ops. For non-local part, this just depends on whether CPU forces are present.
+ bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) && haveCpuForces;
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::NonLocal,
- forceOut.f(), useGpuFBufOps, accumulateForce);
+ forceOut.f(), pme_gpu_get_device_f(fr->pmedata),
+ pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+ useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::NonLocal);
if (!alternateGpuWait && useGpuPme)
{
- pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd);
+ pme_gpu_wait_and_reduce(fr->pmedata, pmeFlags, wcycle, &forceOut.forceWithVirial(), enerd, useGpuFPmeReduction);
}
/* Wait for local GPU NB outputs on the non-alternating wait path */
// - wait for force reduction does not need to block host (at least not here, it's sufficient to wait
// before the next CPU task that consumes the forces: vsite spread or update)
//
- if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
+ if (useGpuFBufOps == BufferOpsUseGpu::True && (haveCpuForces || DOMAINDECOMP(cr)))
{
nbv->launch_copy_f_to_gpu(forceOut.f(), Nbnxm::AtomLocality::Local);
}
+ // flag to specify if forces should be accumulated in force
+ // buffer ops. For local part, this depends on whether CPU
+ // forces are present, or if DD is active (in which case the
+ // halo exchange has resulted in contributions from the
+ // non-local part).
+ bool accumulateForce = (useGpuFBufOps == BufferOpsUseGpu::True) &&
+ (haveCpuForces || DOMAINDECOMP(cr));
nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
- forceOut.f(), useGpuFBufOps, accumulateForce);
+ forceOut.f(), pme_gpu_get_device_f(fr->pmedata),
+ pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+ useGpuFBufOps, useGpuFPmeReduction, accumulateForce);
+
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
nbv->launch_copy_f_from_gpu(forceOut.f(), Nbnxm::AtomLocality::Local);
/* Add the force array(s) from nbnxn_atomdata_t to f */
template <bool useGpu>
-void reduceForces(nbnxn_atomdata_t *nbat,
- const Nbnxm::AtomLocality locality,
- const Nbnxm::GridSet &gridSet,
- rvec *f,
- gmx_nbnxn_gpu_t *gpu_nbv,
- GpuBufferOpsAccumulateForce accumulateForce)
+void reduceForces(nbnxn_atomdata_t *nbat,
+ const Nbnxm::AtomLocality locality,
+ const Nbnxm::GridSet &gridSet,
+ rvec *f,
+ void *pmeFDeviceBuffer,
+ GpuEventSynchronizer *pmeForcesReady,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ bool useGpuFPmeReduction,
+ bool accumulateForce)
{
int a0 = 0;
int na = 0;
{
Nbnxm::nbnxn_gpu_add_nbat_f_to_f(locality,
gpu_nbv,
+ pmeFDeviceBuffer,
+ pmeForcesReady,
a0, na,
+ useGpuFPmeReduction,
accumulateForce);
-
}
else
{
const Nbnxm::AtomLocality locality,
const Nbnxm::GridSet &gridSet,
rvec *f,
+ void *pmeFDeviceBuffer,
+ GpuEventSynchronizer *pmeForcesReady,
gmx_nbnxn_gpu_t *gpu_nbv,
- GpuBufferOpsAccumulateForce accumulateForce);
+ bool useGpuFPmeReduction,
+ bool accumulateForce);
template
void reduceForces<false>(nbnxn_atomdata_t *nbat,
const Nbnxm::AtomLocality locality,
const Nbnxm::GridSet &gridSet,
rvec *f,
+ void *pmeFDeviceBuffer,
+ GpuEventSynchronizer *pmeForcesReady,
gmx_nbnxn_gpu_t *gpu_nbv,
- GpuBufferOpsAccumulateForce accumulateForce);
+ bool useGpuFPmeReduction,
+ bool accumulateForce);
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
rvec *fshift)
enum class BufferOpsUseGpu;
+class GpuEventSynchronizer;
+
namespace Nbnxm
{
class GridSet;
enum class KernelType;
}
-enum class GpuBufferOpsAccumulateForce;
-
/* Convenience type for vector with aligned memory */
template<typename T>
using AlignedVector = std::vector < T, gmx::AlignedAllocator < T>>;
Nbnxm::AtomLocality locality,
const Nbnxm::GridSet &gridSet,
rvec *f,
+ void *pmeFDeviceBuffer,
+ GpuEventSynchronizer *pmeForcesReady,
gmx_nbnxn_gpu_t *gpu_nbv,
- GpuBufferOpsAccumulateForce accumulateForce);
+ bool useGpuFPmeReduction,
+ bool accumulateForce);
extern template
const Nbnxm::AtomLocality locality,
const Nbnxm::GridSet &gridSet,
rvec *f,
+ void *pmeFDeviceBuffer,
+ GpuEventSynchronizer *pmeForcesReady,
gmx_nbnxn_gpu_t *gpu_nbv,
- GpuBufferOpsAccumulateForce accumulateForce);
+ bool useGpuFPmeReduction,
+ bool accumulateForce);
extern template
void reduceForces<false>(nbnxn_atomdata_t *nbat,
const Nbnxm::AtomLocality locality,
const Nbnxm::GridSet &gridSet,
rvec *f,
+ void *pmeFDeviceBuffer,
+ GpuEventSynchronizer *pmeForcesReady,
gmx_nbnxn_gpu_t *gpu_nbv,
- GpuBufferOpsAccumulateForce accumulateForce);
+ bool useGpuFPmeReduction,
+ bool accumulateForce);
/* Add the fshift force stored in nbat to fshift */
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t *nbat,
/*! \brief CUDA kernel to add part of the force array(s) from nbnxn_atomdata_t to f
*
- * \param[in] fnb Force in nbat format
- * \param[in,out] f Force buffer to be reduced into
- * \param[in] cell Cell index mapping
- * \param[in] a0 start atom index
- * \param[in] a1 end atom index
- * \param[in] stride stride between atoms in memory
+ * \tparam accumulateForce Whether the output force buffer should be accumulated into or overwritten
+ * \tparam addPmeF Whether the PME force buffer should be added during the reduction
+ * \param[in] fnb Force in nbat format
+ * \param[in] fPmeDeviceBuffer PME force in device buffer
+ * \param[in,out] f Force buffer to be reduced into
+ * \param[in] cell Cell index mapping
+ * \param[in] atomStart Start atom index
+ * \param[in] nAtoms Number of atoms
*/
-template <bool accumulateForce>
+template <bool accumulateForce, bool addPmeF>
__global__ void
nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb,
- rvec * f,
+ const float3 *__restrict__ fPmeDeviceBuffer,
+ float3 * f,
const int *__restrict__ cell,
const int atomStart,
const int nAtoms);
-template <bool accumulateForce>
+template <bool accumulateForce, bool addPmeF>
__global__ void
nbnxn_gpu_add_nbat_f_to_f_kernel(const float3 *__restrict__ fnb,
- rvec * f,
+ const float3 *__restrict__ fPmeDeviceBuffer,
+ float3 * f,
const int *__restrict__ cell,
const int atomStart,
const int nAtoms)
{
int i = cell[atomStart+threadIndex];
- float3 *f_dest = (float3 *)&f[atomStart+threadIndex][XX];
+ float3 *fDest = &f[atomStart+threadIndex];
+ float3 temp;
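+ // accumulate in a register and write back with a single global store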
if (accumulateForce)
{
- *f_dest += fnb[i];
+ temp = *fDest;
+ temp += fnb[i];
}
else
{
- *f_dest = fnb[i];
+ temp = fnb[i];
}
+ if (addPmeF)
+ {
+ temp += fPmeDeviceBuffer[atomStart+threadIndex];
+ }
+ *fDest = temp;
}
return;
#include "nbnxm_cuda.h"
#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "gromacs/mdlib/force_flags.h"
#include "gromacs/nbnxm/atomdata.h"
}
/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
-void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
- gmx_nbnxn_gpu_t *nb,
- int atomStart,
- int nAtoms,
- GpuBufferOpsAccumulateForce accumulateForce)
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
+ gmx_nbnxn_gpu_t *nb,
+ void *fPmeDevicePtr,
+ GpuEventSynchronizer *pmeForcesReady,
+ int atomStart,
+ int nAtoms,
+ bool useGpuFPmeReduction,
+ bool accumulateForce)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
cudaStream_t stream = nb->stream[iLocality];
+ cu_atomdata_t *adat = nb->atdat;
+ bool addPmeF = useGpuFPmeReduction;
- cu_atomdata_t *adat = nb->atdat;
+ if (addPmeF)
+ {
+ // Stream must wait for PME force completion
+ pmeForcesReady->enqueueWaitEvent(stream);
+ }
/* launch kernel */
config.sharedMemorySize = 0;
config.stream = stream;
- auto kernelFn = (accumulateForce == GpuBufferOpsAccumulateForce::True) ?
- nbnxn_gpu_add_nbat_f_to_f_kernel<true> : nbnxn_gpu_add_nbat_f_to_f_kernel<false>;
- const float3 *fPtr = adat->f;
- rvec *frvec = nb->frvec;
- const int *cell = nb->cell;
+ auto kernelFn = accumulateForce ?
+ nbnxn_gpu_add_nbat_f_to_f_kernel<true, false> :
+ nbnxn_gpu_add_nbat_f_to_f_kernel<false, false>;
+
+ if (addPmeF)
+ {
+ kernelFn = accumulateForce ?
+ nbnxn_gpu_add_nbat_f_to_f_kernel<true, true> :
+ nbnxn_gpu_add_nbat_f_to_f_kernel<false, true>;
+ }
+
+ const float3 *d_fNB = adat->f;
+ const float3 *d_fPme = (float3*) fPmeDevicePtr;
+ float3 *d_f = (float3*) nb->frvec;
+ const int *d_cell = nb->cell;
const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config,
- &fPtr,
- &frvec,
- &cell,
+ &d_fNB,
+ &d_fPme,
+ &d_f,
+ &d_cell,
&atomStart,
&nAtoms);
return pairSearch_->gridSet().cells();
}
+void
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality,
+ rvec *f)
+{
+
+ /* Skip the reduction if there was no short-range GPU work to do
+ * (either NB or both NB and bonded work). */
+ if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
+ {
+ return;
+ }
+
+ wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
+ wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
+
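+ // CPU-only reduction: no PME device force buffer or synchronizer is involved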
+ reduceForces<false>(nbat.get(), locality, pairSearch_->gridSet(), f, nullptr, nullptr, gpu_nbv, false, false);
+
+ wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
+ wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+}
+
void
nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const Nbnxm::AtomLocality locality,
rvec *f,
+ void *fPmeDeviceBuffer,
+ GpuEventSynchronizer *pmeForcesReady,
BufferOpsUseGpu useGpu,
- GpuBufferOpsAccumulateForce accumulateForce)
+ bool useGpuFPmeReduction,
+ bool accumulateForce)
{
- GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) &&
- (accumulateForce == GpuBufferOpsAccumulateForce::True)),
+ GMX_ASSERT(!((useGpu == BufferOpsUseGpu::False) && accumulateForce),
"Accumulatation of force is only valid when GPU buffer ops are active");
+ GMX_ASSERT(!useGpuFPmeReduction || (fPmeDeviceBuffer != nullptr),
+ "GPU PME force reduction requires a valid PME force device buffer");
+
/* Skip the reduction if there was no short-range GPU work to do
* (either NB or both NB and bonded work). */
if (!pairlistIsSimple() && !haveGpuShortRangeWork(locality))
wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
auto fn = useGpu == BufferOpsUseGpu::True ? reduceForces<true> : reduceForces<false>;
- fn(nbat.get(), locality, pairSearch_->gridSet(), f, gpu_nbv, accumulateForce);
+ fn(nbat.get(), locality, pairSearch_->gridSet(), f, fPmeDeviceBuffer, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);
wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
False
};
-/*! \brief Switch for whether forces should accumulate in GPU buffer ops */
-enum class GpuBufferOpsAccumulateForce
-{
- True, // Force should be accumulated and format converted
- False, // Force should be not accumulated, just format converted
- Null // GPU buffer ops are not in use, so this object is not applicable
-};
-
+class GpuEventSynchronizer;
namespace gmx
{
int forceFlags,
t_nrnb *nrnb);
- //! Add the forces stored in nbat to f, zeros the forces in nbat */
+ /*! \brief Add the forces stored in nbat to f, zeroing the forces in nbat
+ * \param[in] locality Local or non-local
+ * \param[in,out] f Force to be added to
+ */
+ void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality,
+ rvec *f);
+
+ /*! \brief Add the forces stored in nbat to f, allowing for the possibility that GPU buffer ops are active
+ * \param[in] locality Local or non-local
+ * \param[in,out] f Force to be added to
+ * \param[in] fPme Force from the PME calculation
+ * \param[in] pmeForcesReady Event triggered when the PME force calculation has completed
+ * \param[in] useGpu Whether GPU buffer ops are active
+ * \param[in] useGpuFPmeReduction Whether the PME force reduction is done on the GPU
+ * \param[in] accumulateForce Whether force should be accumulated or stored
+ */
void atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality locality,
rvec *f,
+ void *fPme,
+ GpuEventSynchronizer *pmeForcesReady,
BufferOpsUseGpu useGpu,
- GpuBufferOpsAccumulateForce accumulateForce);
+ bool useGpuFPmeReduction,
+ bool accumulateForce);
/*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
void atomdata_init_add_nbat_f_to_f_gpu();
struct nbnxn_atomdata_t;
enum class GpuTaskCompletion;
-enum class GpuBufferOpsAccumulateForce;
namespace gmx
{
CUDA_FUNC_QUALIFIER
void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality gmx_unused atomLocality,
gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ void gmx_unused *fPmeDevicePtr,
+ GpuEventSynchronizer gmx_unused *pmeForcesReady,
int gmx_unused atomStart,
int gmx_unused nAtoms,
- GpuBufferOpsAccumulateForce gmx_unused accumulateForce) CUDA_FUNC_TERM;
+ bool gmx_unused useGpuFPmeReduction,
+ bool gmx_unused accumulateForce) CUDA_FUNC_TERM;
/*! \brief Copy force buffer from CPU to GPU */
CUDA_FUNC_QUALIFIER