From 1b346f03a1fcd2c73e0b0c0982585fc6f852e6f8 Mon Sep 17 00:00:00 2001
From: Szilárd Páll
Date: Fri, 27 Sep 2019 19:21:01 +0200
Subject: [PATCH] Make the wait on nonbonded GPU results conditional

When the force reduction is done on the GPU and no energy or shift
force results are required, there is no need for the CPU to block until
the GPU nonbonded kernels complete. This change makes the wait
conditional on whether nonbonded force, energy, or shift force outputs
are needed on the CPU, so with GPU buffer ops the blocking wait is now
skipped on force-only steps.

Also removed the now-unnecessary boolean argument passed to
gpu_launch_cpyback().

Refs #3128

Change-Id: Ic1285f5a00ac910cd1d6c4358f41f2c7c41dea4c
---
 src/gromacs/mdlib/sim_util.cpp         |  6 ++----
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu   |  9 +++++----
 src/gromacs/nbnxm/gpu_common.h         | 23 ++++++++++++++++++++---
 src/gromacs/nbnxm/nbnxm_gpu.h          |  3 +--
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp |  3 +--
 5 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 5c40ff7f16..11ef7cd0da 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1323,15 +1323,13 @@ void do_force(FILE *fplog,
         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
-        bool copyBackNbForce = (useGpuFBufOps == BufferOpsUseGpu::False);
-
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      stepWork, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+                                      stepWork, Nbnxm::AtomLocality::NonLocal);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                  stepWork, Nbnxm::AtomLocality::Local, copyBackNbForce);
+                                  stepWork, Nbnxm::AtomLocality::Local);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 201a97055f..7c7e7d097b 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -643,8 +643,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
                         nbnxn_atomdata_t        *nbatom,
                         const gmx::StepWorkload &stepWork,
-                        const AtomLocality       atomLocality,
-                        const bool               copyBackNbForce)
+                        const AtomLocality       atomLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -682,8 +681,10 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
     }
 
-    /* DtoH f */
-    if (copyBackNbForce)
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
     {
         cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
                           (adat_len)*sizeof(*adat->f), stream);
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index f39e40ea8a..5127db3c57 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -382,6 +382,17 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
 
     /* determine interaction locality from atom locality */
     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
+
+    // Transfers are launched and therefore need to be waited on if:
+    // - buffer ops are not offloaded to the GPU;
+    // - energies or virials are needed (on the local stream).
+    //
+    // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
+    // in the current code, as virial steps do the reduction on the CPU.)
+    const bool haveResultToWaitFor =
+        (!stepWork.useGpuFBufferOps ||
+         (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
+
     // We skip the wait if during the non-local phase there was actually no work to do.
     // This is consistent with nbnxn_gpu_launch_kernel, but it also considers possible
     // bonded GPU work.
@@ -407,16 +418,22 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
 
             wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
         }
-        else
+        else if (haveResultToWaitFor)
         {
             gpuStreamSynchronize(nb->stream[iLocality]);
         }
 
+        // TODO: this needs to be moved later because a conditional wait could break timing
+        // with a future OpenCL implementation, but with CUDA timing is anyway disabled
+        // in all cases where we skip the wait.
         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork,
                                nb->bDoTime != 0);
 
-        gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
-                                  e_lj, e_el, as_rvec_array(shiftForces.data()));
+        if (stepWork.computeEnergy || stepWork.computeVirial)
+        {
+            gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
+                                      e_lj, e_el, as_rvec_array(shiftForces.data()));
+        }
     }
 
     /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index b0941b46b0..c6eef3fa80 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -146,8 +146,7 @@ GPU_FUNC_QUALIFIER
 void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
                         nbnxn_atomdata_t gmx_unused *nbatom,
                         const gmx::StepWorkload gmx_unused &stepWork,
-                        AtomLocality gmx_unused aloc,
-                        bool gmx_unused copyBackNbForce) GPU_FUNC_TERM;
+                        AtomLocality gmx_unused aloc) GPU_FUNC_TERM;
 
 /*! \brief Attempts to complete nonbonded GPU task.
  *
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index 2ce49b2eb9..867f289da5 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -735,8 +735,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_ocl_t         *nb,
                         struct nbnxn_atomdata_t *nbatom,
                         const gmx::StepWorkload &stepWork,
-                        const AtomLocality       aloc,
-                        const bool gmx_unused    copyBackNbForce)
+                        const AtomLocality       aloc)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
-- 
2.22.0
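
For illustration, below is a minimal standalone sketch of the conditional-wait
predicate this patch introduces in gpu_common.h. The StepWorkload flags and
AtomLocality values mirror the ones used in the patch; the simplified struct,
the demo harness, and the step scenarios are hypothetical stand-ins added here
only to show on which kinds of steps the blocking wait is now skipped.

// Standalone sketch (C++14) of the conditional-wait logic from gpu_common.h.
#include <cstdio>

enum class AtomLocality { Local, NonLocal };

// Simplified stand-in for gmx::StepWorkload: only the flags the predicate reads.
struct StepWorkload
{
    bool useGpuFBufferOps = false; // force buffer ops / reduction offloaded to GPU
    bool computeEnergy    = false;
    bool computeVirial    = false;
};

// The predicate from the patch: a blocking wait on the nonbonded stream is
// only needed if the D2H force copy was launched (buffer ops done on the CPU),
// or if energies/virials staged on the local stream must be reduced on the CPU.
static bool haveResultToWaitFor(const StepWorkload &stepWork, AtomLocality aloc)
{
    return !stepWork.useGpuFBufferOps
           || (aloc == AtomLocality::Local
               && (stepWork.computeEnergy || stepWork.computeVirial));
}

int main()
{
    // Hypothetical step scenarios, chosen to span the cases in the commit message.
    const struct { const char *name; StepWorkload work; } steps[] = {
        { "CPU buffer ops, force-only step", { false, false, false } },
        { "GPU buffer ops, force-only step", { true,  false, false } },
        { "GPU buffer ops, energy step",     { true,  true,  false } },
    };

    for (const auto &s : steps)
    {
        std::printf("%-34s local wait: %-3s  non-local wait: %s\n", s.name,
                    haveResultToWaitFor(s.work, AtomLocality::Local) ? "yes" : "no",
                    haveResultToWaitFor(s.work, AtomLocality::NonLocal) ? "yes" : "no");
    }
    return 0;
}

On a force-only step with GPU buffer ops the predicate is false for both
localities, so the gpuStreamSynchronize() branch is never reached. Restricting
the energy/virial condition to the local stream matches the patch: staged
energies and shift forces are only reduced after the local kernels complete.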