Remove excessive H2D and D2H copies of forces when update is offloaded

author Artem Zhmurov <zhmurov@gmail.com>

Thu, 17 Oct 2019 13:36:09 +0000 (15:36 +0200)

committer Berk Hess <hess@kth.se>

Thu, 31 Oct 2019 07:16:42 +0000 (08:16 +0100)
author Artem Zhmurov <zhmurov@gmail.com>
Thu, 17 Oct 2019 13:36:09 +0000 (15:36 +0200)
committer Berk Hess <hess@kth.se>
Thu, 31 Oct 2019 07:16:42 +0000 (08:16 +0100)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 632f2c76be8a25bb0de25e967de6d6b21829a1b2..b5670683d4601fbb374c971306d80f945f1a0587 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1758,8 +1758,17 @@ void do_force(FILE                                     *fplog,
                                                pmeForcePtr,
                                                dependencyList,
                                                stepWork.useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
-            stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
-            stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
+            // Copy forces to host if they are needed for update or if virtual sites are enabled.
+            // If there are vsites, we need to copy forces every step to spread vsite forces on host.
+            // TODO: When the output flags will be included in step workload, this copy can be combined with the
+            //       copy call done in sim_utils(...) for the output.
+            // NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
+            //       they should not be copied in do_md(...) for the output.
+            if (!simulationWork.useGpuUpdate || vsite)
+            {
+                stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
+                stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
+            }
          }
          else
          {
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 4435129be071c08533776f5dab71ae398ed26f96..82125e4fb54769c8dd1bfe9051513a18d4488c94 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -1170,6 +1170,22 @@ void gmx::LegacySimulator::do_md()
              stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
              stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
          }
+
+        // Copy forces for the output if the forces were reduced on the GPU (not the case on virial steps)
+        // and update is offloaded hence forces are kept on the GPU for update and have not been
+        // already transferred in do_force().
+        // TODO: There should be an improved, explicit mechanism that ensures this copy is only executed
+        //       when the forces are ready on the GPU -- the same synchronizer should be used as the one
+        //       prior to GPU update.
+        // TODO: When the output flags will be included in step workload, this copy can be combined with the
+        //       copy call in do_force(...).
+        // NOTE: The forces should not be copied here if the vsites are present, since they were modified
+        //       on host after the D2H copy in do_force(...).
+        if (runScheduleWork->stepWork.useGpuFBufferOps && (simulationWork.useGpuUpdate && !vsite) && do_per_step(step, ir->nstfout))
+        {
+            stateGpu->copyForcesFromGpu(ArrayRef<RVec>(f), AtomLocality::Local);
+            stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
+        }
          /* Now we have the energies and forces corresponding to the
           * coordinates at time t. We must output all of this before
           * the update.
@@ -1293,16 +1309,17 @@ void gmx::LegacySimulator::do_md()
                  stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
              }
  
-            stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::All);
-
-            // TODO: Use StepWorkload fields.
-            bool useGpuFBufferOps = simulationWork.useGpuBufferOps && !(bCalcVir || bCalcEner);
+            // If the buffer ops were not offloaded this step, the forces are on the host and have to be copied
+            if (!runScheduleWork->stepWork.useGpuFBufferOps)
+            {
+                stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::Local);
+            }
  
              bool doTempCouple       = (ir->etc != etcNO && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
              bool doParrinelloRahman = (ir->epc == epcPARRINELLORAHMAN && do_per_step(step + ir->nstpcouple - 1, ir->nstpcouple));
  
              // This applies Leap-Frog, LINCS and SETTLE in succession
-            integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, useGpuFBufferOps),
+            integrator->integrate(stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
                                    ir->delta_t, true, bCalcVir, shake_vir,
                                    doTempCouple, ekind->tcstat,
                                    doParrinelloRahman, ir->nstpcouple*ir->delta_t, M);
author	Artem Zhmurov <zhmurov@gmail.com>
	Thu, 17 Oct 2019 13:36:09 +0000 (15:36 +0200)
committer	Berk Hess <hess@kth.se>
	Thu, 31 Oct 2019 07:16:42 +0000 (08:16 +0100)
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history