Make the wait on nonbonded GPU results conditional
author    Szilárd Páll <pall.szilard@gmail.com>
          Fri, 27 Sep 2019 17:21:01 +0000 (19:21 +0200)
committer Paul Bauer <paul.bauer.q@gmail.com>
          Wed, 16 Oct 2019 06:50:09 +0000 (08:50 +0200)
When the force reduction is done on the GPU and no energy or shift
force results are required, there is no need for the CPU to block and
wait until the GPU nonbonded kernels complete.
This change makes the wait conditional on whether nonbonded force,
energy, or shift force outputs are actually required, so with GPU
buffer ops enabled the blocking wait is now skipped on force-only
steps.
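
Condensed, the new gating logic in gpu_try_finish_task() amounts to
the sketch below (an illustrative restatement of the gpu_common.h hunk
further down, with the GPU-timing code path omitted):

    // A result must be waited on if the force D2H copy was launched
    // (i.e. buffer ops / reduction is not offloaded), or if energies
    // or virials were requested on the local stream.
    const bool haveResultToWaitFor =
        (!stepWork.useGpuFBufferOps ||
         (aloc == AtomLocality::Local &&
          (stepWork.computeEnergy || stepWork.computeVirial)));

    if (haveResultToWaitFor)
    {
        gpuStreamSynchronize(nb->stream[iLocality]); // blocking wait
    }
    // otherwise return without blocking; the GPU force reduction
    // proceeds asynchronously on its stream.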

Also removed the now-unnecessary boolean argument passed to
gpu_launch_cpyback(); the function now checks stepWork.useGpuFBufferOps
directly.
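
Inside gpu_launch_cpyback() the copy-back decision then reduces to a
check of the step workload, roughly as follows (see the CUDA hunk
below for the actual code):

    /* DtoH f: skipped when buffer ops / reduction is offloaded to the
     * GPU, since the CPU-side force buffer is not needed in that case.
     */
    if (!stepWork.useGpuFBufferOps)
    {
        cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3,
                          adat->f + adat_begin,
                          (adat_len) * sizeof(*adat->f), stream);
    }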

Refs #3128

Change-Id: Ic1285f5a00ac910cd1d6c4358f41f2c7c41dea4c

src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
src/gromacs/nbnxm/gpu_common.h
src/gromacs/nbnxm/nbnxm_gpu.h
src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index 5c40ff7f165c710376db8a40c833b6551d7b1280..11ef7cd0daebe3f1d28fdfc28ad2bd213a61a262 100644 (file)
@@ -1323,15 +1323,13 @@ void do_force(FILE                                     *fplog,
         wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
-        bool copyBackNbForce  = (useGpuFBufOps == BufferOpsUseGpu::False);
-
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                      stepWork, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+                                      stepWork, Nbnxm::AtomLocality::NonLocal);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
-                                  stepWork, Nbnxm::AtomLocality::Local, copyBackNbForce);
+                                  stepWork, Nbnxm::AtomLocality::Local);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
 
         if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
index 201a97055fdce93cb7df1b0b8de171590299cb44..7c7e7d097b4dd05994b6c65abc12fc8147977a0a 100644 (file)
@@ -643,8 +643,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t          *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
                         nbnxn_atomdata_t        *nbatom,
                         const gmx::StepWorkload &stepWork,
-                        const AtomLocality       atomLocality,
-                        const bool               copyBackNbForce)
+                        const AtomLocality       atomLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -682,8 +681,10 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
     }
 
-    /* DtoH f */
-    if (copyBackNbForce)
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
     {
         cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
                           (adat_len)*sizeof(*adat->f), stream);
index f39e40ea8a2dd251cbf501f22d1a6034363f6b0b..5127db3c57cebb626092e1582adf68309e724a01 100644 (file)
@@ -382,6 +382,17 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
     /* determine interaction locality from atom locality */
     const InteractionLocality iLocality = gpuAtomToInteractionLocality(aloc);
 
+
+    // Transfers are launched and therefore need to be waited on if:
+    // - buffer ops is not offloaded
+    // - energies or virials are needed (on the local stream)
+    //
+    // (Note that useGpuFBufferOps and computeVirial are mutually exclusive
+    // in the current code, as virial steps do the reduction on the CPU.)
+    const bool haveResultToWaitFor =
+        (!stepWork.useGpuFBufferOps ||
+         (aloc == AtomLocality::Local && (stepWork.computeEnergy || stepWork.computeVirial)));
+
     //  We skip the wait when there was actually no work to do during the non-local phase.
     //  This is consistent with nbnxn_gpu_launch_kernel, but it also considers possible
     //  bonded GPU work.
@@ -407,16 +418,22 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
 
             wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
         }
-        else
+        else if (haveResultToWaitFor)
         {
             gpuStreamSynchronize(nb->stream[iLocality]);
         }
 
+        // TODO: this needs to be moved later because a conditional wait could break timing
+        // with a future OpenCL implementation, but with CUDA, timing is disabled anyway
+        // in all cases where we skip the wait.
         gpu_accumulate_timings(nb->timings, nb->timers, nb->plist[iLocality], aloc, stepWork,
                                nb->bDoTime != 0);
 
-        gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
-                                  e_lj, e_el, as_rvec_array(shiftForces.data()));
+        if (stepWork.computeEnergy || stepWork.computeVirial)
+        {
+            gpu_reduce_staged_outputs(nb->nbst, iLocality, stepWork.computeEnergy, stepWork.computeVirial,
+                                      e_lj, e_el, as_rvec_array(shiftForces.data()));
+        }
     }
 
     /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
index b0941b46b0b1ff8f98a9b82d8ae307f9e0d3cc03..c6eef3fa800538205ce066ba8cff22ae041bc0ae 100644 (file)
@@ -146,8 +146,7 @@ GPU_FUNC_QUALIFIER
 void gpu_launch_cpyback(gmx_nbnxn_gpu_t         gmx_unused *nb,
                         nbnxn_atomdata_t        gmx_unused *nbatom,
                         const gmx::StepWorkload gmx_unused &stepWork,
-                        AtomLocality            gmx_unused aloc,
-                        bool                    gmx_unused copyBackNbForce) GPU_FUNC_TERM;
+                        AtomLocality            gmx_unused aloc) GPU_FUNC_TERM;
 
 /*! \brief Attempts to complete nonbonded GPU task.
  *
index 2ce49b2eb995736ec74de1d52799fd071af94d36..867f289da5f2f23223810c381276dc62a8bc6f59 100644 (file)
@@ -735,8 +735,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
 void gpu_launch_cpyback(gmx_nbnxn_ocl_t                          *nb,
                         struct nbnxn_atomdata_t                  *nbatom,
                         const gmx::StepWorkload                  &stepWork,
-                        const AtomLocality                        aloc,
-                        const bool                     gmx_unused copyBackNbForce)
+                        const AtomLocality                        aloc)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");