Make the wait on nonbonded GPU results conditional
[alexxy/gromacs.git] / src / gromacs / nbnxm / cuda / nbnxm_cuda.cu
index 201a97055fdce93cb7df1b0b8de171590299cb44..7c7e7d097b4dd05994b6c65abc12fc8147977a0a 100644 (file)
@@ -643,8 +643,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t          *nb,
 void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
                         nbnxn_atomdata_t        *nbatom,
                         const gmx::StepWorkload &stepWork,
-                        const AtomLocality       atomLocality,
-                        const bool               copyBackNbForce)
+                        const AtomLocality       atomLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -682,8 +681,10 @@ void gpu_launch_cpyback(gmx_nbnxn_cuda_t        *nb,
         CU_RET_ERR(stat, "cudaStreamWaitEvent on nonlocal_done failed");
     }
 
-    /* DtoH f */
-    if (copyBackNbForce)
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
     {
         cu_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f + adat_begin,
                           (adat_len)*sizeof(*adat->f), stream);