When the force reduction is done on the GPU and no energy or shift
force results are required, there is no need for the CPU to block until
the GPU nonbonded kernels complete.
This change makes the wait conditional on whether nonbonded force,
energy, or shift force outputs are required, so with GPU buffer ops the
blocking wait is now skipped on force-only steps.
Also removed the now unnecessary boolean argument passed to
gpu_launch_cpyback().
Refs #3128
Change-Id: Ic1285f5a00ac910cd1d6c4358f41f2c7c41dea4c
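For reference, a self-contained sketch of the new wait decision follows.
The StepWorkload fields and AtomLocality values mirror those used in the
diff below; the stand-in types, the haveResultToWaitFor() helper, and
main() are illustrative only, not GROMACS code.

    #include <cassert>

    // Minimal stand-ins for the GROMACS types involved (illustrative only).
    enum class AtomLocality { Local, NonLocal };
    struct StepWorkload
    {
        bool useGpuFBufferOps = false; // force buffer ops/reduction on the GPU
        bool computeEnergy    = false;
        bool computeVirial    = false;
    };

    // The CPU must block on the nonbonded stream only if some result is
    // consumed on the host: the forces themselves (no GPU buffer ops), or
    // energies/shift forces staged via the local stream.
    static bool haveResultToWaitFor(const StepWorkload &stepWork, AtomLocality aloc)
    {
        return !stepWork.useGpuFBufferOps
               || (aloc == AtomLocality::Local
                   && (stepWork.computeEnergy || stepWork.computeVirial));
    }

    int main()
    {
        StepWorkload forceOnly;
        forceOnly.useGpuFBufferOps = true;
        // Force-only step with GPU buffer ops: the blocking wait is skipped.
        assert(!haveResultToWaitFor(forceOnly, AtomLocality::Local));

        StepWorkload energyStep = forceOnly;
        energyStep.computeEnergy = true;
        // Energy/shift-force outputs are staged on the local stream, so the
        // local wait still happens; the non-local wait is still skipped.
        assert(haveResultToWaitFor(energyStep, AtomLocality::Local));
        assert(!haveResultToWaitFor(energyStep, AtomLocality::NonLocal));
        return 0;
    }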
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- bool copyBackNbForce = (useGpuFBufOps == BufferOpsUseGpu::False);
-
if (havePPDomainDecomposition(cr))
{
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- stepWork, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+ stepWork, Nbnxm::AtomLocality::NonLocal);
}
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- stepWork, Nbnxm::AtomLocality::Local, copyBackNbForce);
+ stepWork, Nbnxm::AtomLocality::Local);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
nbnxn_atomdata_t *nbatom,
const gmx::StepWorkload &stepWork,
- const AtomLocality atomLocality,
- const bool copyBackNbForce)
+ const AtomLocality atomLocality)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
}
- /* DtoH f */
- if (copyBackNbForce)
+ /* DtoH f
+ * Skip if buffer ops / reduction is offloaded to the GPU.
+ */
+ if (!stepWork.useGpuFBufferOps)
{
cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
(adat_len)*sizeof(*adat->f), stream);
/* determine interaction locality from atom locality */
const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
+
+ // Transfers are launched and therefore need to be waited on if:
+ // - buffer ops is not offloaded
+ // - energies or virials are needed (on the local stream)
+ //
+ // (Note that useGpuFBufferOps and computeVirial are mutually exclusive in the
+ // current code, as virial steps do the force reduction on the CPU.)
+ const bool haveResultToWaitFor =
+ (!stepWork.useGpuFBufferOps ||
+ (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
+
// We skip when during the non-local phase there was actually no work to do.
// This is consistent with nbnxn_gpu_launch_kernel but it also considers possible
// bonded GPU work.
wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
}
+ else if (haveResultToWaitFor)
{
gpuStreamSynchronize(nb->stream[iLocality]);
}
+ // TODO: this needs to be moved later because a conditional wait could break
+ // timing with a future OpenCL implementation, but with CUDA, timing is
+ // disabled anyway in all cases where we skip the wait.
gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork,
nb->bDoTime != 0);
- gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
- e_lj, e_el, as_rvec_array(shiftForces.data()));
+ if (stepWork.computeEnergy || stepWork.computeVirial)
+ {
+ gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
+ e_lj, e_el, as_rvec_array(shiftForces.data()));
+ }
}
/* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
nbnxn_atomdata_t gmx_unused *nbatom,
const gmx::StepWorkload gmx_unused &stepWork,
- AtomLocality gmx_unused aloc,
- bool gmx_unused copyBackNbForce) GPU_FUNC_TERM;
+ AtomLocality gmx_unused aloc) GPU_FUNC_TERM;
/*! \brief Attempts to complete nonbonded GPU task.
*
void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
struct nbnxn_atomdata_t *nbatom,
const gmx::StepWorkload &stepWork,
- const AtomLocality aloc,
- const bool gmx_unused copyBackNbForce)
+ const AtomLocality aloc)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");