{
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
- nbv->wait_stream_gpu(Nbnxm::AtomLocality::NonLocal);
+ nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::NonLocal);
}
dd_move_f(cr->dd, force.unpaddedArrayRef(), fr->fshift, wcycle);
}
if (bUseOrEmulGPU && !alternateGpuWait)
{
+ // TODO: move these steps as early as possible:
+ // - the CPU-force H2D copy should be launched as soon as all CPU-side forces are done
+ // - the wait for the GPU force reduction does not need to block the host here; it is
+ //   sufficient to wait before the next CPU task that consumes the forces (vsite spread or update)
+ //
if (useGpuFBufOps == BufferOpsUseGpu::True && haveCpuForces)
{
nbv->launch_copy_f_to_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
if (useGpuFBufOps == BufferOpsUseGpu::True)
{
nbv->launch_copy_f_from_gpu(forceOut.f, Nbnxm::AtomLocality::Local);
- nbv->wait_stream_gpu(Nbnxm::AtomLocality::Local);
+ nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
}
}
int gridId,
int numColumnsMax)
{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+ GMX_ASSERT(x, "Need a valid x pointer");
+
cu_atomdata_t *adat = nb->atdat;
bool bDoTime = nb->bDoTime;
{
d_x = (rvec*) xPmeDevicePtr;
}
+ GMX_ASSERT(d_x, "Need a valid d_x pointer");
/* launch kernel on GPU */
int nAtoms,
GpuBufferOpsAccumulateForce accumulateForce)
{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
- cu_atomdata_t *adat = nb->atdat;
- cudaStream_t stream = atomLocality == AtomLocality::Local ?
- nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+ const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+ cudaStream_t stream = nb->stream[iLocality];
+
+ cu_atomdata_t *adat = nb->atdat;
/* launch kernel */
gmx_nbnxn_gpu_t *nb,
rvec *f)
{
- cudaStream_t stream = atomLocality == AtomLocality::Local ?
- nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
- bool bDoTime = nb->bDoTime;
- cu_timers_t *t = nb->timers;
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+ GMX_ASSERT(f, "Need a valid f pointer");
- int atomStart = 0, nAtoms = 0;
+ const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+ cudaStream_t stream = nb->stream[iLocality];
+
+ bool bDoTime = nb->bDoTime;
+ cu_timers_t *t = nb->timers;
+
+ int atomStart = 0, nAtoms = 0;
nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
gmx_nbnxn_gpu_t *nb,
rvec *f)
{
- cudaStream_t stream = atomLocality == AtomLocality::Local ?
- nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
- bool bDoTime = nb->bDoTime;
- cu_timers_t *t = nb->timers;
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+ GMX_ASSERT(f, "Need a valid f pointer");
- int atomStart = 0, nAtoms = 0;
+ const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
+ cudaStream_t stream = nb->stream[iLocality];
+
+ bool bDoTime = nb->bDoTime;
+ cu_timers_t *t = nb->timers;
+ int atomStart, nAtoms;
nbnxn_get_atom_range(atomLocality, gridSet, &atomStart, &nAtoms);
t->xf[atomLocality].nb_d2h.openTimingRegion(stream);
}
+ GMX_ASSERT(nb->frvec, "Need a valid nb->frvec pointer");
rvec *ptrDest = reinterpret_cast<rvec *> (f[atomStart]);
rvec *ptrSrc = reinterpret_cast<rvec *> (nb->frvec[atomStart]);
//copyFromDeviceBuffer(ptrDest, &ptrSrc, 0, nAtoms,
return;
}
-void nbnxn_wait_stream_gpu(const AtomLocality gmx_unused atomLocality,
- gmx_nbnxn_gpu_t *nb)
+void nbnxn_wait_for_gpu_force_reduction(const AtomLocality gmx_unused atomLocality,
+ gmx_nbnxn_gpu_t *nb)
{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ const InteractionLocality iLocality = gpuAtomToInteractionLocality(atomLocality);
- cudaStream_t stream = atomLocality == AtomLocality::Local ?
- nb->stream[InteractionLocality::Local] : nb->stream[InteractionLocality::NonLocal];
+ cudaStream_t stream = nb->stream[iLocality];
cudaStreamSynchronize(stream);