Merge commit d30f2cb6 from release-2020 into master
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 8e0541bc8634e80e9a2c7999c16b286e60a88948..938e774d5d85a65cd365b78d3ffbc2d2eb7d925f 100644
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013-2019, by the GROMACS development team, led by
+ * Copyright (c) 2013-2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -1031,14 +1031,36 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    // Copy coordinate from the GPU if update is on the GPU and there are forces to be computed on
-    // the CPU, or for the computation of virial. At search steps the current coordinates are
-    // already on the host, hence copy is not needed.
+    // TODO Update this comment when introducing SimulationWorkload
+    //
+    // The conditions for gpuHaloExchange, e.g. the use of GPU buffer
+    // operations, were checked before construction, so here we can
+    // simply use it and assert on any violated preconditions.
+    gmx::GpuHaloExchange* gpuHaloExchange =
+            (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
+    const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
+    GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True),
+               "Must use coordinate buffer ops with GPU halo exchange");
+    const bool useGpuForcesHaloExchange =
+            ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
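+    // Forces take the GPU-direct halo-exchange path only when force buffer
+    // ops also run on the GPU; otherwise the force halo is communicated via
+    // the host.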
+
+    // Copy coordinates from the GPU if update is on the GPU and there
+    // are forces to be computed on the CPU, or for the computation of
+    // virial, or if host-side data will be transferred from this task
+    // to a remote task for halo exchange or PME-PP communication. At
+    // search steps the current coordinates are already on the host,
+    // hence copy is not needed.
+    const bool haveHostPmePpComms =
+            !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
+    const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication;
+
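+    // Tracks whether the local-coordinate D2H copy below has been enqueued,
+    // so that the later waitCoordinatesReadyOnHost() calls can assert that a
+    // copy was actually scheduled; gmx_used_in_debug marks the variable as
+    // referenced only by such asserts.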
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
-        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
+        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
+            || haveHostPmePpComms || haveHostHaloExchangeComms))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+        haveCopiedXFromGpu = true;
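+        // The wait on this copy is deferred to its consumers below, which
+        // allows the transfer to overlap with independent work on this step.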
     }
 
     const auto localXReadyOnDevice = (stateGpu != nullptr)
@@ -1057,6 +1079,16 @@ void do_force(FILE*                               fplog,
         bool reinitGpuPmePpComms = simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
         bool sendCoordinatesFromGpu =
                 simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
+
+        if (!stepWork.doNeighborSearch && simulationWork.useGpuUpdate && !sendCoordinatesFromGpu)
+        {
+            GMX_RELEASE_ASSERT(false,
+                               "GPU update and separate PME ranks are only supported with GPU "
+                               "direct communication!");
+            // TODO: when this code path becomes supported, add:
+            // stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+        }
+
         gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
                                  lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
                                  step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
@@ -1173,6 +1205,13 @@ void do_force(FILE*                               fplog,
         }
         else
         {
+            if (simulationWork.useGpuUpdate)
+            {
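+                // CPU-side coordinate conversion consumes the coordinates
+                // produced by the GPU update, so the D2H copy scheduled above
+                // must be complete before converting.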
+                GMX_ASSERT(stateGpu, "need a valid stateGpu object");
+                GMX_ASSERT(haveCopiedXFromGpu,
+                           "a wait should only be triggered if copy has been scheduled");
+                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+            }
             nbv->convertCoordinates(AtomLocality::Local, false, x.unpaddedArrayRef());
         }
     }
@@ -1219,19 +1258,6 @@ void do_force(FILE*                               fplog,
         launchPmeGpuFftAndGather(fr->pmedata, wcycle);
     }
 
-    // TODO Update this comment when introducing SimulationWorkload
-    //
-    // The conditions for gpuHaloExchange e.g. using GPU buffer
-    // operations were checked before construction, so here we can
-    // just use it and assert upon any conditions.
-    gmx::GpuHaloExchange* gpuHaloExchange =
-            (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
-    const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
-    GMX_ASSERT(!ddUsesGpuDirectCommunication || (useGpuXBufOps == BufferOpsUseGpu::True),
-               "Must use coordinate buffer ops with GPU halo exchange");
-    const bool useGpuForcesHaloExchange =
-            ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
-
     /* Communicate coordinates and sum dipole if necessary +
        do non-local pair search */
     if (havePPDomainDecomposition(cr))
@@ -1268,6 +1294,10 @@ void do_force(FILE*                               fplog,
             }
             else
             {
+                // Note: GPU update + DD without GPU-direct communication is not
+                // supported; if it becomes supported, a waitCoordinatesReadyOnHost()
+                // should be issued here.
+                GMX_ASSERT(!simulationWork.useGpuUpdate,
+                           "GPU update is not supported with CPU halo exchange");
                 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
             }
 
@@ -1380,6 +1410,14 @@ void do_force(FILE*                               fplog,
         dd_force_flop_start(cr->dd, nrnb);
     }
 
+    // For the remaining CPU tasks that depend on coordinates produced by the
+    // GPU update, this wait ensures that the D2H transfer is complete.
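+    // The other consumers of this copy, host-staged PME-PP and halo-exchange
+    // communication, are asserted against above, so only CPU force work and
+    // the virial computation need to be covered here.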
+    if ((simulationWork.useGpuUpdate)
+        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
+    {
+        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+    }
+
     if (inputrec->bRot)
     {
         wallcycle_start(wcycle, ewcROT);
@@ -1634,9 +1672,10 @@ void do_force(FILE*                               fplog,
         wallcycle_stop(wcycle, ewcFORCE);
     }
 
-    // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
-    // TODO refoactor this and unify with below default-path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication)
+    // On the GPU PME-PP comms or GPU update path, receive the forces from PME
+    // before the GPU buffer ops, so that all forces are in place in time for
+    // the GPU force reduction and, where active, the GPU update.
+    // TODO refactor this and unify with below default-path call to the same function
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME)
+        && (simulationWork.useGpuPmePpCommunication || simulationWork.useGpuUpdate))
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
@@ -1694,9 +1733,16 @@ void do_force(FILE*                               fplog,
             //   These should be unified.
             if (haveLocalForceContribInCpuBuffer && !useGpuForcesHaloExchange)
             {
-                stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
+                // Note: AtomLocality::All is used for the non-DD case because
+                // copyForcesToGpu() then uses a separate stream, which allows the
+                // CPU-force H2D transfer to overlap with GPU force tasks on all
+                // streams, including those in the local stream that would otherwise
+                // be implicit dependencies of the transfer and prevent the overlap.
+                auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
+
+                stateGpu->copyForcesToGpu(forceWithShift, locality);
                 dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
-                        AtomLocality::Local, useGpuFBufOps == BufferOpsUseGpu::True));
+                        locality, useGpuFBufOps == BufferOpsUseGpu::True));
             }
             if (useGpuForcesHaloExchange)
             {
@@ -1753,8 +1799,9 @@ void do_force(FILE*                               fplog,
         }
     }
 
-    // TODO refoactor this and unify with above PME-PP GPU communication path call to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication)
+    // TODO refactor this and unify with above GPU PME-PP / GPU update path call to the same function
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication
+        && !simulationWork.useGpuUpdate)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.