From 1b346f03a1fcd2c73e0b0c0982585fc6f852e6f8 Mon Sep 17 00:00:00 2001
From: Szilárd Páll
Date: Fri, 27 Sep 2019 19:21:01 +0200
Subject: [PATCH] Make the wait on nonbonded GPU results conditional

When the force reduction is done on the GPU and no energy or shift
force results are required, there is no need for the CPU to block until
the GPU nonbonded kernels complete. This change makes the wait
conditional on whether nonbonded force, energy, or shift force outputs
are needed on the CPU, so with GPU buffer ops the blocking wait is now
skipped on force-only steps.

Also removed the now-unnecessary boolean argument passed to
gpu_launch_cpyback().

Refs #3128

Change-Id: Ic1285f5a00ac910cd1d6c4358f41f2c7c41dea4c
---
 src/gromacs/mdlib/sim_util.cpp         |  6 ++----
 src/gromacs/nbnxm/cuda/nbnxm_cuda.cu   |  9 +++++----
 src/gromacs/nbnxm/gpu_common.h         | 23 ++++++++++++++++++++---
 src/gromacs/nbnxm/nbnxm_gpu.h          |  3 +--
 src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp |  3 +--
 5 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 5c40ff7f16..11ef7cd0da 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1323,15 +1323,13 @@ void do_force(FILE *fplog,
         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
-        bool copyBackNbForce = (useGpuFBufOps == BufferOpsUseGpu::False);
-
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      stepWork, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+                                      stepWork, Nbnxm::AtomLocality::NonLocal);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                  stepWork, Nbnxm::AtomLocality::Local, copyBackNbForce);
+                                  stepWork, Nbnxm::AtomLocality::Local);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
index 201a97055f..7c7e7d097b 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -643,8 +643,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
                         nbnxn_atomdata_t        *nbatom,
                         const gmx::StepWorkload &stepWork,
-                        const AtomLocality       atomLocality,
-                        const bool               copyBackNbForce)
+                        const AtomLocality       atomLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -682,8 +681,10 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
     }
 
-    /* DtoH f */
-    if (copyBackNbForce)
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
     {
         cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
                           (adat_len)*sizeof(*adat->f), stream);
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index f39e40ea8a..5127db3c57 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -382,6 +382,17 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
 
     /* determine interaction locality from atom locality */
     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
+
+    // Transfers are launched and therefore need to be waited on if:
+    // - buffer ops are not offloaded to the GPU;
+    // - energies or virials are needed (on the local stream).
+    //
+    // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
+    // in the current code, as virial steps do the reduction on the CPU.)
+    const bool haveResultToWaitFor =
+        (!stepWork.useGpuFBufferOps ||
+         (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
+
     // We skip the wait if during the non-local phase there was actually no work to do.
     // This is consistent with nbnxn_gpu_launch_kernel, but it also considers possible
     // bonded GPU work.
@@ -407,16 +418,22 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t *nb,
 
             wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
         }
-        else
+        else if (haveResultToWaitFor)
         {
             gpuStreamSynchronize(nb->stream[iLocality]);
         }
 
+        // TODO: this needs to be moved later because a conditional wait could break timing
+        // with a future OpenCL implementation, but with CUDA timing is anyway disabled
+        // in all cases where we skip the wait.
         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork,
                                nb->bDoTime != 0);
 
-        gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
-                                  e_lj, e_el, as_rvec_array(shiftForces.data()));
+        if (stepWork.computeEnergy || stepWork.computeVirial)
+        {
+            gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
+                                      e_lj, e_el, as_rvec_array(shiftForces.data()));
+        }
     }
 
     /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h
index b0941b46b0..c6eef3fa80 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -146,8 +146,7 @@ GPU_FUNC_QUALIFIER
 void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
                         nbnxn_atomdata_t gmx_unused *nbatom,
                         const gmx::StepWorkload gmx_unused &stepWork,
-                        AtomLocality gmx_unused aloc,
-                        bool gmx_unused copyBackNbForce) GPU_FUNC_TERM;
+                        AtomLocality gmx_unused aloc) GPU_FUNC_TERM;
 
 /*! \brief Attempts to complete nonbonded GPU task.
  *
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index 2ce49b2eb9..867f289da5 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -735,8 +735,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb,
 void gpu_launch_cpyback(gmx_nbnxn_ocl_t         *nb,
                         struct nbnxn_atomdata_t *nbatom,
                         const gmx::StepWorkload &stepWork,
-                        const AtomLocality       aloc,
-                        const bool gmx_unused    copyBackNbForce)
+                        const AtomLocality       aloc)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
-- 
2.22.0
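
For illustration, below is a minimal standalone sketch of the conditional-wait
predicate this patch introduces in gpu_common.h. The StepWorkload flags and
AtomLocality values mirror the ones used in the patch; the simplified struct,
the demo harness, and the step scenarios are hypothetical stand-ins added here
only to show on which kinds of steps the blocking wait is now skipped.

// Standalone sketch (C++14) of the conditional-wait logic from gpu_common.h.
#include <cstdio>

enum class AtomLocality { Local, NonLocal };

// Simplified stand-in for gmx::StepWorkload: only the flags the predicate reads.
struct StepWorkload
{
    bool useGpuFBufferOps = false; // force buffer ops / reduction offloaded to GPU
    bool computeEnergy    = false;
    bool computeVirial    = false;
};

// The predicate from the patch: a blocking wait on the nonbonded stream is
// only needed if the D2H force copy was launched (buffer ops done on the CPU),
// or if energies/virials staged on the local stream must be reduced on the CPU.
static bool haveResultToWaitFor(const StepWorkload &stepWork, AtomLocality aloc)
{
    return !stepWork.useGpuFBufferOps
           || (aloc == AtomLocality::Local
               && (stepWork.computeEnergy || stepWork.computeVirial));
}

int main()
{
    // Hypothetical step scenarios, chosen to span the cases in the commit message.
    const struct { const char *name; StepWorkload work; } steps[] = {
        { "CPU buffer ops, force-only step", { false, false, false } },
        { "GPU buffer ops, force-only step", { true,  false, false } },
        { "GPU buffer ops, energy step",     { true,  true,  false } },
    };

    for (const auto &s : steps)
    {
        std::printf("%-34s local wait: %-3s  non-local wait: %s\n", s.name,
                    haveResultToWaitFor(s.work, AtomLocality::Local) ? "yes" : "no",
                    haveResultToWaitFor(s.work, AtomLocality::NonLocal) ? "yes" : "no");
    }
    return 0;
}

On a force-only step with GPU buffer ops the predicate is false for both
localities, so the gpuStreamSynchronize() branch is never reached. Restricting
the energy/virial condition to the local stream matches the patch: staged
energies and shift forces are only reduced after the local kernels complete.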