Allow overlapping CPU force H2D with compute

author Szilárd Páll <pall.szilard@gmail.com>

Mon, 2 Dec 2019 18:29:28 +0000 (19:29 +0100)

committer Berk Hess <hess@kth.se>

Thu, 12 Dec 2019 21:12:25 +0000 (22:12 +0100)
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 2 Dec 2019 18:29:28 +0000 (19:29 +0100)
committer Berk Hess <hess@kth.se>
Thu, 12 Dec 2019 21:12:25 +0000 (22:12 +0100)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 832ff21620aa2b325108257c605753d817e441bd..86549f915ba519384fcda1e4451647aa8d3aabd3 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1706,9 +1706,16 @@ void do_force(FILE*                               fplog,
              //   These should be unified.
              if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
              {
-                stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
+                // Note: in the non-DD case AtomLocality::All is used because, with it,
+                // copyForcesToGpu() issues the transfer in a separate stream. This lets the
+                // CPU force H2D copy overlap with GPU force tasks on all streams, including
+                // tasks in the local stream, which would otherwise be implicit dependencies
+                // of the transfer and prevent the overlap.
+                auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
+
+                stateGpu->copyForcesToGpu(forceWithShift, locality);
                  dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
-                        AtomLocality::Local, useGpuFBufOps == BufferOpsUseGpu::True));
+                        locality, useGpuFBufOps == BufferOpsUseGpu::True));
              }
              if (useGpuForcesHaloExchange)
              {
author	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 2 Dec 2019 18:29:28 +0000 (19:29 +0100)
committer	Berk Hess <hess@kth.se>
	Thu, 12 Dec 2019 21:12:25 +0000 (22:12 +0100)