Make the wait on nonbonded GPU results conditional

[alexxy/gromacs.git] / src / gromacs / nbnxm / cuda / nbnxm_cuda.cu
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu

index 201a97055fdce93cb7df1b0b8de171590299cb44..7c7e7d097b4dd05994b6c65abc12fc8147977a0a 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu
@@ -643,8 +643,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t          *nb,
  void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
                          nbnxn_atomdata_t        *nbatom,
                          const gmx::StepWorkload &stepWork,
-                        const AtomLocality       atomLocality,
-                        const bool               copyBackNbForce)
+                        const AtomLocality       atomLocality)
  {
      GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
  
@@ -682,8 +681,10 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
          CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
      }
  
-    /* DtoH f */
-    if (copyBackNbForce)
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
      {
          cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
                            (adat_len)*sizeof(*adat->f), stream);