Make the wait on nonbonded GPU results conditional
author    Szilárd Páll <pall.szilard@gmail.com>
          Fri, 27 Sep 2019 17:21:01 +0000 (19:21 +0200)
committer Paul Bauer <paul.bauer.q@gmail.com>
          Wed, 16 Oct 2019 06:50:09 +0000 (08:50 +0200)
When the force reduction is done on the GPU and no energy or shift
force results are required, there is no need for the CPU to block and
wait until the GPU nonbonded kernels complete.
This change makes the wait conditional on whether nonbonded force,
energy, or shift force outputs are actually required, so with GPU
buffer ops enabled the blocking wait is now skipped on force-only
steps.
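
Condensed, the new gating logic in gpu_try_finish_task() amounts to
the sketch below (an illustrative restatement of the gpu_common.h hunk
further down, with the GPU-timing code path omitted):

    // A result must be waited on if the force D2H copy was launched
    // (i.e. buffer ops / reduction is not offloaded), or if energies
    // or virials were requested on the local stream.
    const bool haveResultToWaitFor =
        (!stepWork.useGpuFBufferOps ||
         (aloc == AtomLocality::Local &&
          (stepWork.computeEnergy || stepWork.computeVirial)));

    if (haveResultToWaitFor)
    {
        gpuStreamSynchronize(nb->stream[iLocality]); // blocking wait
    }
    // otherwise return without blocking; the GPU force reduction
    // proceeds asynchronously on its stream.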

Also removed the now-unnecessary boolean argument passed to
gpu_launch_cpyback(); the function now checks stepWork.useGpuFBufferOps
directly.
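
Inside gpu_launch_cpyback() the copy-back decision then reduces to a
check of the step workload, roughly as follows (see the CUDA hunk
below for the actual code):

    /* DtoH f: skipped when buffer ops / reduction is offloaded to the
     * GPU, since the CPU-side force buffer is not needed in that case.
     */
    if (!stepWork.useGpuFBufferOps)
    {
        cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3,
                          adat->f + adat_begin,
                          (adat_len) * sizeof(*adat->f), stream);
    }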

Refs #3128

Change-Id: Ic1285f5a00ac910cd1d6c4358f41f2c7c41dea4c

src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/gpu_common.h
src/gromacs/nbnxm/nbnxm_gpu.h
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index 5c40ff7f165c710376db8a40c833b6551d7b1280..11ef7cd0daebe3f1d28fdfc28ad2bd213a61a262 100644 (file)
@@ -1323,15 +1323,13 @@ void do_force(FILE                                     *fplog,
         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
-        bool copyBackNbForce  = (useGpuFBufOps == BufferOpsUseGpu::False);
-
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      stepWork, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+                                      stepWork, Nbnxm::AtomLocality::NonLocal);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                  stepWork, Nbnxm::AtomLocality::Local, copyBackNbForce);
+                                  stepWork, Nbnxm::AtomLocality::Local);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
index 201a97055fdce93cb7df1b0b8de171590299cb44..7c7e7d097b4dd05994b6c65abc12fc8147977a0a 100644 (file)
@@ -643,8 +643,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t          *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
                         nbnxn_atomdata_t        *nbatom,
                         const gmx::StepWorkload &stepWork,
-                        const AtomLocality       atomLocality,
-                        const bool               copyBackNbForce)
+                        const AtomLocality       atomLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -682,8 +681,10 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
     }
 
-    /* DtoH f */
-    if (copyBackNbForce)
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
     {
         cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
                           (adat_len)*sizeof(*adat->f), stream);
index f39e40ea8a2dd251cbf501f22d1a6034363f6b0b..5127db3c57cebb626092e1582adf68309e724a01 100644 (file)
@@ -382,6 +382,17 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
     /* determine interaction locality from atom locality */
     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 
+
+    // Transfers are launched and therefore need to be waited on if:
+    // - buffer ops is not offloaded
+    // - energies or virials are needed (on the local stream)
+    //
+    // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
+    // in the current code, as virial steps do the reduction on the CPU.)
+    const bool haveResultToWaitFor =
+        (!stepWork.useGpuFBufferOps ||
+         (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
+
     //  We skip the wait when there was actually no work to do during the non-local phase.
     //  This is consistent with nbnxn_gpu_launch_kernel, but it also considers possible
     //  bonded GPU work.
@@ -407,16 +418,22 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
 
             wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
         }
-        else
+        else if (haveResultToWaitFor)
         {
             gpuStreamSynchronize(nb->stream[iLocality]);
         }
 
+        // TODO: this needs to be moved later because a conditional wait could break timing
+        // with a future OpenCL implementation, but with CUDA, timing is disabled anyway
+        // in all cases where we skip the wait.
         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork,
                                nb->bDoTime != 0);
 
-        gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
-                                  e_lj, e_el, as_rvec_array(shiftForces.data()));
+        if (stepWork.computeEnergy || stepWork.computeVirial)
+        {
+            gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
+                                      e_lj, e_el, as_rvec_array(shiftForces.data()));
+        }
     }
 
     /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
index b0941b46b0b1ff8f98a9b82d8ae307f9e0d3cc03..c6eef3fa800538205ce066ba8cff22ae041bc0ae 100644 (file)
@@ -146,8 +146,7 @@ GPU_FUNC_QUALIFIER
 void gpu_launch_cpyback(gmx_nbnxn_gpu_t         gmx_unused *nb,
                         nbnxn_atomdata_t        gmx_unused *nbatom,
                         const gmx::StepWorkload gmx_unused &stepWork,
-                        AtomLocality            gmx_unused aloc,
-                        bool                    gmx_unused copyBackNbForce) GPU_FUNC_TERM;
+                        AtomLocality            gmx_unused aloc) GPU_FUNC_TERM;
 
 /*! \brief Attempts to complete nonbonded GPU task.
  *
index 2ce49b2eb995736ec74de1d52799fd071af94d36..867f289da5f2f23223810c381276dc62a8bc6f59 100644 (file)
@@ -735,8 +735,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
 void gpu_launch_cpyback(gmx_nbnxn_ocl_t                          *nb,
                         struct nbnxn_atomdata_t                  *nbatom,
                         const gmx::StepWorkload                  &stepWork,
-                        const AtomLocality                        aloc,
-                        const bool                     gmx_unused copyBackNbForce)
+                        const AtomLocality                        aloc)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");