Allow x D2H to overlap with GPU force compute

author Szilárd Páll <pall.szilard@gmail.com>

Tue, 10 Dec 2019 11:24:32 +0000 (12:24 +0100)

committer Paul Bauer <paul.bauer.q@gmail.com>

Wed, 18 Dec 2019 13:30:27 +0000 (14:30 +0100)
author Szilárd Páll <pall.szilard@gmail.com>
Tue, 10 Dec 2019 11:24:32 +0000 (12:24 +0100)
committer Paul Bauer <paul.bauer.q@gmail.com>
Wed, 18 Dec 2019 13:30:27 +0000 (14:30 +0100)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 86549f915ba519384fcda1e4451647aa8d3aabd3..d8d592aeb47d89aba4184da61372636ac89caadb 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1054,12 +1054,14 @@ void do_force(FILE*                               fplog,
      const bool haveHostPmePpComms =
              !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
      const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication;
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
      if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
          && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
              || haveHostPmePpComms || haveHostHaloExchangeComms))
      {
          stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+        haveCopiedXFromGpu = true;
      }
  
      const auto localXReadyOnDevice = (stateGpu != nullptr)
@@ -1078,6 +1080,16 @@ void do_force(FILE*                               fplog,
          bool reinitGpuPmePpComms = simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
          bool sendCoordinatesFromGpu =
                  simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
+
+        if (simulationWork.useGpuUpdate && !sendCoordinatesFromGpu)
+        {
+            GMX_RELEASE_ASSERT(false,
+                               "GPU update and separate PME ranks are only supported with GPU "
+                               "direct communication!");
+            // TODO: when this code-path becomes supported add:
+            // stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+        }
+
          gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
                                   lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
                                   step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
@@ -1197,6 +1209,13 @@ void do_force(FILE*                               fplog,
          }
          else
          {
+            if (simulationWork.useGpuUpdate)
+            {
+                GMX_ASSERT(stateGpu, "need a valid stateGpu object");
+                GMX_ASSERT(haveCopiedXFromGpu,
+                           "a wait should only be triggered if copy has been scheduled");
+                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+            }
              nbv->convertCoordinates(AtomLocality::Local, false, x.unpaddedArrayRef());
          }
      }
@@ -1279,6 +1298,10 @@ void do_force(FILE*                               fplog,
              }
              else
              {
+                // Note: GPU update + DD without direct communication is not supported,
+                // a waitCoordinatesReadyOnHost() should be issued if it will be.
+                GMX_ASSERT(!simulationWork.useGpuUpdate,
+                           "GPU update is not supported with halo exchange");
                  dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
              }
  
@@ -1391,6 +1414,14 @@ void do_force(FILE*                               fplog,
          dd_force_flop_start(cr->dd, nrnb);
      }
  
+    // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
+    // this wait ensures that the D2H transfer is complete.
+    if ((simulationWork.useGpuUpdate)
+        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
+    {
+        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+    }
+
      if (inputrec->bRot)
      {
          wallcycle_start(wcycle, ewcROT);
author	Szilárd Páll <pall.szilard@gmail.com>
	Tue, 10 Dec 2019 11:24:32 +0000 (12:24 +0100)
committer	Paul Bauer <paul.bauer.q@gmail.com>
	Wed, 18 Dec 2019 13:30:27 +0000 (14:30 +0100)