Fix the condition under which the DtoH force copy occurs
author Artem Zhmurov <zhmurov@gmail.com>
Wed, 31 Mar 2021 00:30:33 +0000 (00:30 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 31 Mar 2021 00:30:33 +0000 (00:30 +0000)
Commit d2d4a50b4c636c203028c5bff311924ec15e7825 introduced a
performance regression in which forces were copied from device to
host on each step. This change fixes the issue by reinstating the
proper condition on the copy call.

Fixes #4001
Refs #2608

src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp

index afd969ee7f8d7c34cd1c52211993cd7ce3bb5106..63f8317443314c3886a53f24a1a8724d6324b55c 100644 (file)
@@ -815,17 +815,21 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     }
 
     /* DtoH f */
-    static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
-                  "The host force buffer should be in single precision to match device data size.");
-    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
-                         &adat->f,
-                         atomsRange.begin(),
-                         atomsRange.size(),
-                         deviceStream,
-                         GpuApiCallBehavior::Async,
-                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-
-    issueClFlushInStream(deviceStream);
+    if (!stepWork.useGpuFBufferOps)
+    {
+        static_assert(
+                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
+                "The host force buffer should be in single precision to match device data size.");
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
+                             &adat->f,
+                             atomsRange.begin(),
+                             atomsRange.size(),
+                             deviceStream,
+                             GpuApiCallBehavior::Async,
+                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+
+        issueClFlushInStream(deviceStream);
+    }
 
     /* After the non-local D2H is launched the nonlocal_done event can be
        recorded which signals that the local D2H can proceed. This event is not