wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- bool copyBackNbForce = (useGpuFBufOps == BufferOpsUseGpu::False);
-
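+ // Whether the nonbonded forces need to be copied back to the host is
+ // decided inside gpu_launch_cpyback, based on stepWork.useGpuFBufferOps.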
if (havePPDomainDecomposition(cr))
{
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- stepWork, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+ stepWork, Nbnxm::AtomLocality::NonLocal);
}
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- stepWork, Nbnxm::AtomLocality::Local, copyBackNbForce);
+ stepWork, Nbnxm::AtomLocality::Local);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
void gpu_launch_cpyback(gmx_nbnxn_cuda_t *nb,
nbnxn_atomdata_t *nbatom,
const gmx::StepWorkload &stepWork,
- const AtomLocality atomLocality,
- const bool copyBackNbForce)
+ const AtomLocality atomLocality)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
}
- /* DtoH f */
- if (copyBackNbForce)
+ /* DtoH f
+ * Skip if buffer ops / reduction is offloaded to the GPU.
+ */
+ if (!stepWork.useGpuFBufferOps)
{
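+ /* The device buffer holds adat_len float3 forces starting at adat_begin;
+  * the host-side nbnxn_atomdata_t buffer is a flat float array, hence the
+  * factor 3 in the host offset. */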
cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
(adat_len)*sizeof(*adat->f), stream);
/* determine interaction locality from atom locality */
const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
+
+ // Transfers are launched and therefore need to be waited on if:
+ // - buffer ops are not offloaded to the GPU;
+ // - energies or virials are needed (on the local stream).
+ //
+ // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
+ // in the current code, as virial steps do the reduction on the CPU.)
+ const bool haveResultToWaitFor =
+ (!stepWork.useGpuFBufferOps ||
+ (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
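+ // For example: with GPU buffer ops on a regular MD step, no force transfer
+ // is launched for either locality, so there is nothing to wait for; on an
+ // energy/virial step the local stream still copies back energies and shift
+ // forces, hence the AtomLocality::Local condition above.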
+
// We skip the wait if there was actually no work to do during the non-local
// phase. This is consistent with nbnxn_gpu_launch_kernel, but here we also
// account for possible bonded GPU work.
wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
}
- else
+ else if (haveResultToWaitFor)
{
gpuStreamSynchronize(nb->stream[iLocality]);
}
+ // TODO: this needs to be moved later because a conditional wait could break
+ // timing with a future OpenCL implementation; with CUDA, timing is disabled
+ // anyway in all cases where we skip the wait.
gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork,
nb->bDoTime != 0);
- gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
- e_lj, e_el, as_rvec_array(shiftForces.data()));
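+ // The staged energy and shift-force outputs are only transferred by the
+ // copy-back when energies or virials are computed, so skip the host-side
+ // reduction otherwise.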
+ if (stepWork.computeEnergy || stepWork.computeVirial)
+ {
+ gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
+ e_lj, e_el, as_rvec_array(shiftForces.data()));
+ }
}
/* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
void gpu_launch_cpyback(gmx_nbnxn_gpu_t gmx_unused *nb,
nbnxn_atomdata_t gmx_unused *nbatom,
const gmx::StepWorkload gmx_unused &stepWork,
- AtomLocality gmx_unused aloc,
- bool gmx_unused copyBackNbForce) GPU_FUNC_TERM;
+ AtomLocality gmx_unused aloc) GPU_FUNC_TERM;
/*! \brief Attempts to complete nonbonded GPU task.
*
void gpu_launch_cpyback(gmx_nbnxn_ocl_t *nb,
struct nbnxn_atomdata_t *nbatom,
const gmx::StepWorkload &stepWork,
- const AtomLocality aloc,
- const bool gmx_unused copyBackNbForce)
+ const AtomLocality aloc)
{
GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");