if (useGpuFBufOps == BufferOpsUseGpu::True)
{
+ std::vector<GpuEventSynchronizer*> dependencyList;
+ dependencyList.reserve(2);
+
+ if (useGpuPmeFReduction)
+ {
+ dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
+ }
+
// TODO: move this into DomainLifetimeWorkload, including the second part of the condition
// The bonded and free energy CPU tasks can have non-local force contributions
// which are a dependency for the GPU force reduction.
if (haveNonLocalForceContribInCpuBuffer)
{
stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
+ dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::NonLocal));
}
+
+ //
+ // FIXME: confirm whether passing the PME synchronizer here also introduces
+ // an unintended PME -> non-local stream dependency.
+ //
nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::NonLocal,
stateGpu->getForces(),
pme_gpu_get_device_f(fr->pmedata),
- pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+ dependencyList,
useGpuPmeFReduction, haveNonLocalForceContribInCpuBuffer);
stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
}
* on the non-alternating path. */
if (bUseOrEmulGPU && !alternateGpuWait)
{
+ std::vector<GpuEventSynchronizer*> dependencyList;
+ dependencyList.reserve(2);
+
+ if (useGpuPmeFReduction)
+ {
+ dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
+ }
+
gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
if (useGpuFBufOps == BufferOpsUseGpu::True)
if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
{
stateGpu->copyForcesToGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
+ dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(gmx::StatePropagatorDataGpu::AtomLocality::Local));
}
if (useGpuForcesHaloExchange)
{
// for the local buffer ops on the result of GPU halo
// exchange, which operates in the non-local stream and
// writes to the local part of the force buffer.
+ //
// TODO improve this through use of an event - see Redmine #3093
+ // push the event into the dependencyList
nbv->stream_local_wait_for_nonlocal();
}
nbv->atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality::Local,
stateGpu->getForces(),
pme_gpu_get_device_f(fr->pmedata),
- pme_gpu_get_f_ready_synchronizer(fr->pmedata),
+ dependencyList,
useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
// This function call synchronizes the local stream
nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
}
/* Add the force array(s) from nbnxn_atomdata_t to f */
-void reduceForcesGpu(const Nbnxm::AtomLocality locality,
- DeviceBuffer<float> totalForcesDevice,
- const Nbnxm::GridSet &gridSet,
- void *pmeForcesDevice,
- GpuEventSynchronizer *pmeForcesReady,
- gmx_nbnxn_gpu_t *gpu_nbv,
- bool useGpuFPmeReduction,
- bool accumulateForce)
+void reduceForcesGpu(const Nbnxm::AtomLocality locality,
+ DeviceBuffer<float> totalForcesDevice,
+ const Nbnxm::GridSet &gridSet,
+ void *pmeForcesDevice,
+ gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ bool useGpuFPmeReduction,
+ bool accumulateForce)
{
int atomsStart = 0;
int numAtoms = 0;
totalForcesDevice,
gpu_nbv,
pmeForcesDevice,
- pmeForcesReady,
+ dependencyList,
atomsStart, numAtoms,
useGpuFPmeReduction,
accumulateForce);
* \param[out] totalForcesDevice Device buffer to accumulate resulting force.
* \param[in] gridSet The grids data.
* \param[in] pmeForcesDevice Device buffer with PME forces.
- * \param[in] pmeForcesReady Event that signals when the PME forces are ready for the reduction.
+ * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on.
* \param[in] gpu_nbv The NBNXM GPU data structure.
* \param[in] useGpuFPmeReduction Whether PME forces should be added.
* \param[in] accumulateForce Whether there is useful data already in the total force buffer.
*/
-void reduceForcesGpu(Nbnxm::AtomLocality locality,
- DeviceBuffer<float> totalForcesDevice,
- const Nbnxm::GridSet &gridSet,
- void *pmeForcesDevice,
- GpuEventSynchronizer *pmeForcesReady,
- gmx_nbnxn_gpu_t *gpu_nbv,
- bool useGpuFPmeReduction,
- bool accumulateForce);
+void reduceForcesGpu(Nbnxm::AtomLocality locality,
+ DeviceBuffer<float> totalForcesDevice,
+ const Nbnxm::GridSet &gridSet,
+ void *pmeForcesDevice,
+ gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ bool useGpuFPmeReduction,
+ bool accumulateForce);
/* Add the fshift force stored in nbat to fshift */
void nbnxn_atomdata_add_nbat_fshift_to_fshift(const nbnxn_atomdata_t &nbat,
}
/* F buffer operations on GPU: performs force summations and conversion from nb to rvec format. */
-void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
- DeviceBuffer<float> totalForcesDevice,
- gmx_nbnxn_gpu_t *nb,
- void *pmeForcesDevice,
- GpuEventSynchronizer *pmeForcesReady,
- int atomStart,
- int numAtoms,
- bool useGpuFPmeReduction,
- bool accumulateForce)
+void nbnxn_gpu_add_nbat_f_to_f(const AtomLocality atomLocality,
+ DeviceBuffer<float> totalForcesDevice,
+ gmx_nbnxn_gpu_t *nb,
+ void *pmeForcesDevice,
+ gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
+ int atomStart,
+ int numAtoms,
+ bool useGpuFPmeReduction,
+ bool accumulateForce)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
GMX_ASSERT(numAtoms != 0, "Cannot call function with no atoms");
cudaStream_t stream = nb->stream[iLocality];
cu_atomdata_t *adat = nb->atdat;
- if (useGpuFPmeReduction)
+ size_t gmx_used_in_debug numDependency =
+ static_cast<size_t>((useGpuFPmeReduction == true)) +
+ static_cast<size_t>((accumulateForce == true));
+ GMX_ASSERT(numDependency >= dependencyList.size(), "Mismatching number of dependencies and call signature");
+
+ // Enqueue wait on all dependencies passed
+ for (auto const synchronizer : dependencyList)
{
- //Stream must wait for PME force completion
- pmeForcesReady->enqueueWaitEvent(stream);
+ synchronizer->enqueueWaitEvent(stream);
}
/* launch kernel */
}
void
-nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality locality,
- DeviceBuffer<float> totalForcesDevice,
- void *forcesPmeDevice,
- GpuEventSynchronizer *pmeForcesReady,
- bool useGpuFPmeReduction,
- bool accumulateForce)
+nonbonded_verlet_t::atomdata_add_nbat_f_to_f_gpu(const Nbnxm::AtomLocality locality,
+ DeviceBuffer<float> totalForcesDevice,
+ void *forcesPmeDevice,
+ gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
+ bool useGpuFPmeReduction,
+ bool accumulateForce)
{
GMX_ASSERT((useGpuFPmeReduction == (forcesPmeDevice != nullptr)),
wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
- reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, pmeForcesReady, gpu_nbv, useGpuFPmeReduction, accumulateForce);
+ reduceForcesGpu(locality, totalForcesDevice, pairSearch_->gridSet(), forcesPmeDevice, dependencyList, gpu_nbv, useGpuFPmeReduction, accumulateForce);
wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
* \param [in] locality Local or non-local
* \param [in,out] totalForcesDevice Force to be added to
* \param [in] forcesPmeDevice Device buffer with PME forces
- * \param [in] pmeForcesReady Event triggered when PME force calculation has completed
+ * \param [in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on.
* \param [in] useGpuFPmeReduction Whether PME forces should be added
* \param [in] accumulateForce If the total force buffer already contains data
*/
- void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality locality,
- DeviceBuffer<float> totalForcesDevice,
- void *forcesPmeDevice,
- GpuEventSynchronizer *pmeForcesReady,
- bool useGpuFPmeReduction,
- bool accumulateForce);
+ void atomdata_add_nbat_f_to_f_gpu(Nbnxm::AtomLocality locality,
+ DeviceBuffer<float> totalForcesDevice,
+ void *forcesPmeDevice,
+ gmx::ArrayRef<GpuEventSynchronizer* const> dependencyList,
+ bool useGpuFPmeReduction,
+ bool accumulateForce);
/*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
void atomdata_init_add_nbat_f_to_f_gpu();
* \param[in] totalForcesDevice Device buffer to accumulate resulting force.
* \param[in] gpu_nbv The NBNXM GPU data structure.
* \param[in] pmeForcesDevice Device buffer with PME forces.
- * \param[in] pmeForcesReady Event that signals when the PME forces are ready for the reduction.
+ * \param[in] dependencyList List of synchronizers that represent the dependencies the reduction task needs to sync on.
* \param[in] atomStart Index of the first atom to reduce forces for.
* \param[in] numAtoms Number of atoms to reduce forces for.
* \param[in] useGpuFPmeReduction Whether PME forces should be added.
*
*/
CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_add_nbat_f_to_f(AtomLocality gmx_unused atomLocality,
- DeviceBuffer<float> gmx_unused totalForcesDevice,
- gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- void gmx_unused *pmeForcesDevice,
- GpuEventSynchronizer gmx_unused *pmeForcesReady,
- int gmx_unused atomStart,
- int gmx_unused numAtoms,
- bool gmx_unused useGpuFPmeReduction,
- bool gmx_unused accumulateForce) CUDA_FUNC_TERM;
+void nbnxn_gpu_add_nbat_f_to_f(AtomLocality gmx_unused atomLocality,
+ DeviceBuffer<float> gmx_unused totalForcesDevice,
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ void gmx_unused *pmeForcesDevice,
+ gmx::ArrayRef<GpuEventSynchronizer* const> gmx_unused dependencyList,
+ int gmx_unused atomStart,
+ int gmx_unused numAtoms,
+ bool gmx_unused useGpuFPmeReduction,
+ bool gmx_unused accumulateForce) CUDA_FUNC_TERM;
/*! \brief Wait for GPU stream to complete */
CUDA_FUNC_QUALIFIER