Balance event consumption for GPU update code path

author Andrey Alekseenko <al42and@gmail.com>

Tue, 19 Oct 2021 16:34:43 +0000 (18:34 +0200)

committer Szilárd Páll <pall.szilard@gmail.com>

Thu, 21 Oct 2021 17:44:14 +0000 (17:44 +0000)
author Andrey Alekseenko <al42and@gmail.com>
Tue, 19 Oct 2021 16:34:43 +0000 (18:34 +0200)
committer Szilárd Páll <pall.szilard@gmail.com>
Thu, 21 Oct 2021 17:44:14 +0000 (17:44 +0000)
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cpp b/src/gromacs/mdlib/gpuforcereduction_impl.cpp

index 2a3e48c4ad5afeb75ffaef6537130904bdb1bb31..3de3002b3ac3c72a6b79de09e1b59b92d8791543 100644 (file)
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cpp
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cpp
@@ -139,14 +139,27 @@ void GpuForceReduction::Impl::execute()
                                     baseForce_,
                                     cellInfo_.d_cell,
                                     deviceStream_);
-
-        // Mark that kernel has been launched
-        if (completionMarker_ != nullptr)
+    }
+    else
+    {
+        /* In case we have nothing to do, but still have dependencies, we need
+         * to consume them and mark our own event.
+         * Happens sometimes in MdrunVsitesTest.
+         * Issue #3988, #4227. */
+        for (auto* synchronizer : dependencyList_)
          {
-            completionMarker_->markEvent(deviceStream_);
+            synchronizer->consume();
          }
      }
  
+    /* Mark that kernel has been launched.
+     * Even if we have no work to do and have not launched the kernel, we still mark the event
+     * in order to ensure proper marking/consumption balance, see Issue #3988, #4227. */
+    if (completionMarker_ != nullptr)
+    {
+        completionMarker_->markEvent(deviceStream_);
+    }
+
      wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
      wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
  }
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index a2ccaed47b15b75440558e9d7ad4f433f519a765..e5c3d9bad49c334dc6b586b90672bb3c6e3994bf 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1375,6 +1375,21 @@ void do_force(FILE*                               fplog,
          {
              GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
              stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+            if (stepWork.doNeighborSearch)
+            {
+                /* On NS steps, we skip X buffer ops. So, unless we use PME or direct GPU
+                 * communications, we don't wait for the coordinates on the device,
+                 * and we must consume the event here.
+                 * Issue #3988. */
+                const bool eventWillBeConsumedByGpuPme = stepWork.haveGpuPmeOnThisRank;
+                const bool eventWillBeConsumedByGpuPmePPComm =
+                        (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
+                        && pmeSendCoordinatesFromGpu;
+                if (!eventWillBeConsumedByGpuPme && !eventWillBeConsumedByGpuPmePPComm)
+                {
+                    stateGpu->consumeCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
+                }
+            }
          }
      }
  
@@ -2064,6 +2079,10 @@ void do_force(FILE*                               fplog,
  
                  if (!stepWork.useGpuFHalo)
                  {
+                    /* We don't explicitly wait for the forces to be reduced on device,
+                     * but wait for them to finish copying to CPU instead.
+                     * So, we manually consume the event, see Issue #3988. */
+                    stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
                      // copy from GPU input for dd_move_f()
                      stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
                                                  AtomLocality::NonLocal);
@@ -2271,6 +2290,13 @@ void do_force(FILE*                               fplog,
                  || (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
                  || vsite)
              {
+                if (stepWork.computeNonbondedForces)
+                {
+                    /* We have previously issued force reduction on the GPU, but we will
+                     * not use this event, instead relying on the stream being in-order.
+                     * Issue #3988. */
+                    stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
+                }
                  stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
                  stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
              }
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 0a59a4f2535bca973a34366ef336b2dce519ee23..3898e07d3af54303c6059984bfc88b8a3e7a430f 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -1707,6 +1707,8 @@ void gmx::LegacySimulator::do_md()
                      // TODO: The special case of removing CM motion should be dealt more gracefully
                      if (useGpuForUpdate)
                      {
+                        // Issue #3988, #4106.
+                        stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
                          stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                          // Here we block until the H2D copy completes because event sync with the
                          // force kernels that use the coordinates on the next steps is not implemented
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h

index ad07df2c132236c3763e2b7ce22d14f76506e454..4cf4c7998d28a915907fbc4d1f65c596db643d1e 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -208,6 +208,22 @@ public:
       */
      void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
  
+    /*! \brief Consume the event for copying coordinates to the device.
+     *
+     * Used for manual event consumption. Does nothing except changing the internal event counter.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+    /*! \brief Reset the event for copying coordinates to the device.
+     *
+     * Used for manual event consumption. Does nothing except resetting the event.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
      /*! \brief Setter for the event synchronizer for the update is done on th GPU
       *
       *  \param[in] xUpdatedOnDeviceEvent  The event to synchronize the stream coordinates wre updated on device.
@@ -302,6 +318,12 @@ public:
       */
      GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
  
+    /*! \brief Consume the event for when the forces are reduced on the GPU.
+     *
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     */
+    void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality);
+
      /*! \brief Getter for the event synchronizer for the forces are ready on the GPU.
       *
       *  \param[in] atomLocality      Locality of the particles to wait for.
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp

index 618856d0ce1a6fb650ce9cc5cdcd2c6f18cd9894..99c456277b9108543579c0f5ba94b3146f5e36b2 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -119,6 +119,20 @@ void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality /* atomL
                 "GPU implementation.");
  }
  
+void StatePropagatorDataGpu::consumeCoordinatesCopiedToDeviceEvent(AtomLocality /* atomLocality */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
+void StatePropagatorDataGpu::resetCoordinatesCopiedToDeviceEvent(AtomLocality /* atomLocality */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
  void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* /* xUpdatedOnDeviceEvent */)
  {
      GMX_ASSERT(!impl_,
@@ -223,6 +237,13 @@ GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality /*at
      return nullptr;
  }
  
+void StatePropagatorDataGpu::consumeForcesReducedOnDeviceEvent(AtomLocality /*atomLocality*/)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
  GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality /*atomLocality*/)
  {
      GMX_ASSERT(!impl_,
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h

index dbe435d508ddd13c54aeec0eeb7c21d26cae7784..d72bc5a0f27f0a4277ef2f1f51d711d0c07795ad 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -192,6 +192,22 @@ public:
       */
      void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
  
+    /*! \brief Consume the event for copying coordinates to the device.
+     *
+     * Used for manual event consumption. Does nothing except changing the internal event counter.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+    /*! \brief Reset the event for copying coordinates to the device.
+     *
+     * Used for manual event consumption. Does nothing except resetting the event.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
      /*! \brief Setter for the event synchronizer for the update is done on th GPU
       *
       *  \param[in] xUpdatedOnDeviceEvent  The event to synchronize the stream coordinates wre updated on device.
@@ -284,6 +300,9 @@ public:
       */
      GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
  
+    //! \brief Consume the event for when the forces are reduced on device.
+    void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality);
+
      /*! \brief Getter for the event synchronizer for the forces are ready for GPU update.
       *
       *  \param[in] atomLocality      Locality of the particles to wait for.
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 30478ef046c20b76599195554696c9f363e9808e..e6db89f1a99512b6cf0592f68d7c5f95987c4a9e 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -372,6 +372,16 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceE
      }
      else
      {
+        if (stepWork.doNeighborSearch && xUpdatedOnDeviceEvent_)
+        {
+            /* On search steps, we do not consume the result of the GPU update
+             * but rather that of a H2D transfer. So, we reset the event triggered after
+             * update to avoid leaving it unconsumed.
+             * Unfortunately, we don't always have the event marked either (e.g., on the
+             * first step) so we just reset it here.
+             * See Issue #3988. */
+            xUpdatedOnDeviceEvent_->reset();
+        }
          return &xReadyOnDevice_[atomLocality];
      }
  }
@@ -384,6 +394,18 @@ void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality at
      wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
  }
  
+void StatePropagatorDataGpu::Impl::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    xReadyOnDevice_[atomLocality].consume();
+}
+
+void StatePropagatorDataGpu::Impl::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    xReadyOnDevice_[atomLocality].reset();
+}
+
  void StatePropagatorDataGpu::Impl::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
  {
      GMX_ASSERT(xUpdatedOnDeviceEvent != nullptr, "The event synchronizer can not be nullptr.");
@@ -551,6 +573,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice(AtomLocalit
      return &fReducedOnDevice_[atomLocality];
  }
  
+void StatePropagatorDataGpu::Impl::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality)
+{
+    fReducedOnDevice_[atomLocality].consume();
+}
+
  GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReadyOnDevice(AtomLocality atomLocality)
  {
      return &fReadyOnDevice_[atomLocality];
@@ -657,6 +684,16 @@ void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLoca
      return impl_->waitCoordinatesCopiedToDevice(atomLocality);
  }
  
+void StatePropagatorDataGpu::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+    return impl_->consumeCoordinatesCopiedToDeviceEvent(atomLocality);
+}
+
+void StatePropagatorDataGpu::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+    return impl_->resetCoordinatesCopiedToDeviceEvent(atomLocality);
+}
+
  void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
  {
      impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent);
@@ -723,6 +760,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality atom
      return impl_->fReducedOnDevice(atomLocality);
  }
  
+void StatePropagatorDataGpu::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality)
+{
+    impl_->consumeForcesReducedOnDeviceEvent(atomLocality);
+}
+
  GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality atomLocality)
  {
      return impl_->fReadyOnDevice(atomLocality);
author	Andrey Alekseenko <al42and@gmail.com>
	Tue, 19 Oct 2021 16:34:43 +0000 (18:34 +0200)
committer	Szilárd Páll <pall.szilard@gmail.com>
	Thu, 21 Oct 2021 17:44:14 +0000 (17:44 +0000)
src/gromacs/mdlib/gpuforcereduction_impl.cpp		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp		patch \| blob \| history