// NS step is also a virial step (on which f buf ops are deactivated).
if (simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA))
{
- nbv->atomdata_init_add_nbat_f_to_f_gpu();
+ GMX_ASSERT(stateGpu, "stateGpu should be valid here");
+ nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
}
}
else if (!EI_TPI(inputrec->eI))
pme_gpu_get_device_f(fr->pmedata),
dependencyList,
false, haveNonLocalForceContribInCpuBuffer);
+ // TODO: this should be conditional on whether GPU direct comm is used?
stateGpu->copyForcesFromGpu(forceOut.forceWithShiftForces().force(), gmx::StatePropagatorDataGpu::AtomLocality::NonLocal);
}
else
launchGpuKernel(kernelFn, config, nullptr, "FbufferOps", kernelArgs);
+ if (atomLocality == AtomLocality::Local)
+ {
+ GMX_ASSERT(nb->localFReductionDone != nullptr, "localFReductionDone has to be a valid pointer");
+ nb->localFReductionDone->markEvent(stream);
+ }
}
void* nbnxn_get_x_on_device_event(const gmx_nbnxn_cuda_t *nb)
}
/* Initialization for F buffer operations on GPU. */
-void nbnxn_gpu_init_add_nbat_f_to_f(const int *cell,
- gmx_nbnxn_gpu_t *gpu_nbv,
- int natoms_total)
+void nbnxn_gpu_init_add_nbat_f_to_f(const int *cell,
+ gmx_nbnxn_gpu_t *gpu_nbv,
+ int natoms_total,
+ GpuEventSynchronizer* const localReductionDone)
{
cudaStream_t stream = gpu_nbv->stream[InteractionLocality::Local];
+ GMX_ASSERT(localReductionDone, "localReductionDone should be a valid pointer");
+ gpu_nbv->localFReductionDone = localReductionDone;
+
if (natoms_total > 0)
{
reallocateDeviceBuffer(&gpu_nbv->cell, natoms_total, &gpu_nbv->ncell, &gpu_nbv->ncell_alloc, nullptr);
any dependent task (e.g. transfer of coordinates
to the PME rank's GPU) can proceed. */
+ /*! \brief Pointer to event synchronizer triggered when the local GPU buffer ops / reduction is complete
+ *
+ * \note The synchronizer is managed outside of this module, in StatePropagatorDataGpu.
+ */
+ GpuEventSynchronizer *localFReductionDone;
+
GpuEventSynchronizer *xNonLocalCopyD2HDone; /**< event triggered when
non-local coordinate buffer has been
copied from device to host*/
}
void
-nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu()
+nonbonded_verlet_t::atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* const localReductionDone)
{
wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
Nbnxm::nbnxn_gpu_init_add_nbat_f_to_f(gridSet.cells().data(),
gpu_nbv,
- gridSet.numRealAtomsTotal());
+ gridSet.numRealAtomsTotal(),
+ localReductionDone);
wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
bool useGpuFPmeReduction,
bool accumulateForce);
- /*! \brief Outer body of function to perform initialization for F buffer operations on GPU. */
- void atomdata_init_add_nbat_f_to_f_gpu();
+ /*! \brief Outer body of function to perform initialization for F buffer operations on GPU.
+ *
+ * \param localReductionDone Pointer to an event synchronizer that marks the completion of the local f buffer ops kernel.
+ */
+ void atomdata_init_add_nbat_f_to_f_gpu(GpuEventSynchronizer* localReductionDone);
/*! \brief return pointer to GPU event recorded when coordinates have been copied to device */
void* get_x_on_device_event();
/*! \brief Initialization for F buffer operations on GPU */
CUDA_FUNC_QUALIFIER
-void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell,
- gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
- int gmx_unused natoms_total) CUDA_FUNC_TERM;
+void nbnxn_gpu_init_add_nbat_f_to_f(const int gmx_unused *cell,
+ gmx_nbnxn_gpu_t gmx_unused *gpu_nbv,
+ int gmx_unused natoms_total,
+ GpuEventSynchronizer gmx_unused *localReductionDone) CUDA_FUNC_TERM;
/*! \brief Force buffer operations on GPU.
*