Allow x D2H to overlap with GPU force compute
authorSzilárd Páll <pall.szilard@gmail.com>
Tue, 10 Dec 2019 11:24:32 +0000 (12:24 +0100)
committerPaul Bauer <paul.bauer.q@gmail.com>
Wed, 18 Dec 2019 13:30:27 +0000 (14:30 +0100)
With GPU update coordinates are transferred back to the CPU every step
if there are forces to compute on the CPU. Originally this was
implemented with a back-to-back transfer launch and wait at the
beginning of do_force().
This change moves the CPU wait for the completion of the coordinate
transfer closer to the consumer tasks in order to avoid blocking GPU
force tasks' launch and allowing compute and transfer to overlap.

Fixes #3221

Change-Id: Ia6641147bbec1186b54c1445d36dc31000eae9c4

src/gromacs/mdlib/sim_util.cpp

index 86549f915ba519384fcda1e4451647aa8d3aabd3..d8d592aeb47d89aba4184da61372636ac89caadb 100644 (file)
@@ -1054,12 +1054,14 @@ void do_force(FILE*                               fplog,
     const bool haveHostPmePpComms =
             !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication;
     const bool haveHostHaloExchangeComms = havePPDomainDecomposition(cr) && !ddUsesGpuDirectCommunication;
+
+    bool gmx_used_in_debug haveCopiedXFromGpu = false;
     if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch
         && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial
             || haveHostPmePpComms || haveHostHaloExchangeComms))
     {
         stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
-        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+        haveCopiedXFromGpu = true;
     }
 
     const auto localXReadyOnDevice = (stateGpu != nullptr)
@@ -1078,6 +1080,16 @@ void do_force(FILE*                               fplog,
         bool reinitGpuPmePpComms = simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
         bool sendCoordinatesFromGpu =
                 simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
+
+        if (simulationWork.useGpuUpdate && !sendCoordinatesFromGpu)
+        {
+            GMX_RELEASE_ASSERT(false,
+                               "GPU update and separate PME ranks are only supported with GPU "
+                               "direct communication!");
+            // TODO: when this code-path becomes supported add:
+            // stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+        }
+
         gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()), lambda[efptCOUL],
                                  lambda[efptVDW], (stepWork.computeVirial || stepWork.computeEnergy),
                                  step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
@@ -1197,6 +1209,13 @@ void do_force(FILE*                               fplog,
         }
         else
         {
+            if (simulationWork.useGpuUpdate)
+            {
+                GMX_ASSERT(stateGpu, "need a valid stateGpu object");
+                GMX_ASSERT(haveCopiedXFromGpu,
+                           "a wait should only be triggered if copy has been scheduled");
+                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+            }
             nbv->convertCoordinates(AtomLocality::Local, false, x.unpaddedArrayRef());
         }
     }
@@ -1279,6 +1298,10 @@ void do_force(FILE*                               fplog,
             }
             else
             {
+                // Note: GPU update + DD without direct communication is not supported,
+                // a waitCoordinatesReadyOnHost() should be issued if it will be.
+                GMX_ASSERT(!simulationWork.useGpuUpdate,
+                           "GPU update is not supported with halo exchange");
                 dd_move_x(cr->dd, box, x.unpaddedArrayRef(), wcycle);
             }
 
@@ -1391,6 +1414,14 @@ void do_force(FILE*                               fplog,
         dd_force_flop_start(cr->dd, nrnb);
     }
 
+    // For the rest of the CPU tasks that depend on GPU-update produced coordinates,
+    // this wait ensures that the D2H transfer is complete.
+    if ((simulationWork.useGpuUpdate)
+        && (runScheduleWork->domainWork.haveCpuLocalForceWork || stepWork.computeVirial))
+    {
+        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+    }
+
     if (inputrec->bRot)
     {
         wallcycle_start(wcycle, ewcROT);