Balance event consumption for GPU update code path
authorAndrey Alekseenko <al42and@gmail.com>
Tue, 19 Oct 2021 16:34:43 +0000 (18:34 +0200)
committerSzilárd Páll <pall.szilard@gmail.com>
Thu, 21 Oct 2021 17:44:14 +0000 (17:44 +0000)
- Explicit consumption of events when needed.
- In some cases, reset the event in case the conditionals
  for the consumption are too complex.
- Added helper functions to StatePropagatorDataGpu.
- A special handling for force reduction when it has no atoms.

Refs #3988, #4106, #4227.

src/gromacs/mdlib/gpuforcereduction_impl.cpp
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/md.cpp
src/gromacs/mdtypes/state_propagator_data_gpu.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index 2a3e48c4ad5afeb75ffaef6537130904bdb1bb31..3de3002b3ac3c72a6b79de09e1b59b92d8791543 100644 (file)
@@ -139,14 +139,27 @@ void GpuForceReduction::Impl::execute()
                                    baseForce_,
                                    cellInfo_.d_cell,
                                    deviceStream_);
-
-        // Mark that kernel has been launched
-        if (completionMarker_ != nullptr)
+    }
+    else
+    {
+        /* In case we have nothing to do, but still have dependencies, we need
+         * to consume them and mark our own event.
+         * Happens sometimes in MdrunVsitesTest.
+         * Issue #3988, #4227. */
+        for (auto* synchronizer : dependencyList_)
         {
-            completionMarker_->markEvent(deviceStream_);
+            synchronizer->consume();
         }
     }
 
+    /* Mark that kernel has been launched.
+     * Even if we have no work to do and have not launched the kernel, we still mark the event
+     * in order to ensure proper marking/consumption balance, see Issue #3988, #4227. */
+    if (completionMarker_ != nullptr)
+    {
+        completionMarker_->markEvent(deviceStream_);
+    }
+
     wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
     wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
index a2ccaed47b15b75440558e9d7ad4f433f519a765..e5c3d9bad49c334dc6b586b90672bb3c6e3994bf 100644 (file)
@@ -1375,6 +1375,21 @@ void do_force(FILE*                               fplog,
         {
             GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
             stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+            if (stepWork.doNeighborSearch)
+            {
+                /* On NS steps, we skip X buffer ops. So, unless we use PME or direct GPU
+                 * communications, we don't wait for the coordinates on the device,
+                 * and we must consume the event here.
+                 * Issue #3988. */
+                const bool eventWillBeConsumedByGpuPme = stepWork.haveGpuPmeOnThisRank;
+                const bool eventWillBeConsumedByGpuPmePPComm =
+                        (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
+                        && pmeSendCoordinatesFromGpu;
+                if (!eventWillBeConsumedByGpuPme && !eventWillBeConsumedByGpuPmePPComm)
+                {
+                    stateGpu->consumeCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
+                }
+            }
         }
     }
 
@@ -2064,6 +2079,10 @@ void do_force(FILE*                               fplog,
 
                 if (!stepWork.useGpuFHalo)
                 {
+                    /* We don't explicitly wait for the forces to be reduced on device,
+                     * but wait for them to finish copying to CPU instead.
+                     * So, we manually consume the event, see Issue #3988. */
+                    stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
                     // copy from GPU input for dd_move_f()
                     stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
                                                 AtomLocality::NonLocal);
@@ -2271,6 +2290,13 @@ void do_force(FILE*                               fplog,
                 || (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
                 || vsite)
             {
+                if (stepWork.computeNonbondedForces)
+                {
+                    /* We have previously issued force reduction on the GPU, but we will
+                     * not use this event, instead relying on the stream being in-order.
+                     * Issue #3988. */
+                    stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
+                }
                 stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
                 stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
             }
index 0a59a4f2535bca973a34366ef336b2dce519ee23..3898e07d3af54303c6059984bfc88b8a3e7a430f 100644 (file)
@@ -1707,6 +1707,8 @@ void gmx::LegacySimulator::do_md()
                     // TODO: The special case of removing CM motion should be dealt more gracefully
                     if (useGpuForUpdate)
                     {
+                        // Issue #3988, #4106.
+                        stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
                         stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                         // Here we block until the H2D copy completes because event sync with the
                         // force kernels that use the coordinates on the next steps is not implemented
index ad07df2c132236c3763e2b7ce22d14f76506e454..4cf4c7998d28a915907fbc4d1f65c596db643d1e 100644 (file)
@@ -208,6 +208,22 @@ public:
      */
     void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
 
+    /*! \brief Consume the event for copying coordinates to the device.
+     *
+     * Used for manual event consumption. Does nothing except changing the internal event counter.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+    /*! \brief Reset the event for copying coordinates to the device.
+     *
+     * Used for manual event consumption. Does nothing except resetting the event.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
     /*! \brief Setter for the event synchronizer for the update is done on th GPU
      *
      *  \param[in] xUpdatedOnDeviceEvent  The event to synchronize the stream coordinates wre updated on device.
@@ -302,6 +318,12 @@ public:
      */
     GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
 
+    /*! \brief Consume the event for when the forces are reduced on the GPU.
+     *
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     */
+    void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality);
+
     /*! \brief Getter for the event synchronizer for the forces are ready on the GPU.
      *
      *  \param[in] atomLocality      Locality of the particles to wait for.
index 618856d0ce1a6fb650ce9cc5cdcd2c6f18cd9894..99c456277b9108543579c0f5ba94b3146f5e36b2 100644 (file)
@@ -119,6 +119,20 @@ void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality /* atomL
                "GPU implementation.");
 }
 
+void StatePropagatorDataGpu::consumeCoordinatesCopiedToDeviceEvent(AtomLocality /* atomLocality */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
+void StatePropagatorDataGpu::resetCoordinatesCopiedToDeviceEvent(AtomLocality /* atomLocality */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
 void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* /* xUpdatedOnDeviceEvent */)
 {
     GMX_ASSERT(!impl_,
@@ -223,6 +237,13 @@ GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality /*at
     return nullptr;
 }
 
+void StatePropagatorDataGpu::consumeForcesReducedOnDeviceEvent(AtomLocality /*atomLocality*/)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality /*atomLocality*/)
 {
     GMX_ASSERT(!impl_,
index dbe435d508ddd13c54aeec0eeb7c21d26cae7784..d72bc5a0f27f0a4277ef2f1f51d711d0c07795ad 100644 (file)
@@ -192,6 +192,22 @@ public:
      */
     void waitCoordinatesCopiedToDevice(AtomLocality atomLocality);
 
+    /*! \brief Consume the event for copying coordinates to the device.
+     *
+     * Used for manual event consumption. Does nothing except changing the internal event counter.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
+    /*! \brief Reset the event for copying coordinates to the device.
+     *
+     * Used for manual event consumption. Does nothing except resetting the event.
+     *
+     *  \param[in] atomLocality  Locality of the particles.
+     */
+    void resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality);
+
     /*! \brief Setter for the event synchronizer for the update is done on th GPU
      *
      *  \param[in] xUpdatedOnDeviceEvent  The event to synchronize the stream coordinates wre updated on device.
@@ -284,6 +300,9 @@ public:
      */
     GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
 
+    //! \brief Consume the event for when the forces are reduced on device.
+    void consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality);
+
     /*! \brief Getter for the event synchronizer for the forces are ready for GPU update.
      *
      *  \param[in] atomLocality      Locality of the particles to wait for.
index 30478ef046c20b76599195554696c9f363e9808e..e6db89f1a99512b6cf0592f68d7c5f95987c4a9e 100644 (file)
@@ -372,6 +372,16 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceE
     }
     else
     {
+        if (stepWork.doNeighborSearch && xUpdatedOnDeviceEvent_)
+        {
+            /* On search steps, we do not consume the result of the GPU update
+             * but rather that of a H2D transfer. So, we reset the event triggered after
+             * update to avoid leaving it unconsumed.
+             * Unfortunately, we don't always have the event marked either (e.g., on the
+             * first step) so we just reset it here.
+             * See Issue #3988. */
+            xUpdatedOnDeviceEvent_->reset();
+        }
         return &xReadyOnDevice_[atomLocality];
     }
 }
@@ -384,6 +394,18 @@ void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality at
     wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
 }
 
+void StatePropagatorDataGpu::Impl::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    xReadyOnDevice_[atomLocality].consume();
+}
+
+void StatePropagatorDataGpu::Impl::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+    GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
+    xReadyOnDevice_[atomLocality].reset();
+}
+
 void StatePropagatorDataGpu::Impl::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
 {
     GMX_ASSERT(xUpdatedOnDeviceEvent != nullptr, "The event synchronizer can not be nullptr.");
@@ -551,6 +573,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice(AtomLocalit
     return &fReducedOnDevice_[atomLocality];
 }
 
+void StatePropagatorDataGpu::Impl::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality)
+{
+    fReducedOnDevice_[atomLocality].consume();
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReadyOnDevice(AtomLocality atomLocality)
 {
     return &fReadyOnDevice_[atomLocality];
@@ -657,6 +684,16 @@ void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLoca
     return impl_->waitCoordinatesCopiedToDevice(atomLocality);
 }
 
+void StatePropagatorDataGpu::consumeCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+    return impl_->consumeCoordinatesCopiedToDeviceEvent(atomLocality);
+}
+
+void StatePropagatorDataGpu::resetCoordinatesCopiedToDeviceEvent(AtomLocality atomLocality)
+{
+    return impl_->resetCoordinatesCopiedToDeviceEvent(atomLocality);
+}
+
 void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent)
 {
     impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent);
@@ -723,6 +760,11 @@ GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality atom
     return impl_->fReducedOnDevice(atomLocality);
 }
 
+void StatePropagatorDataGpu::consumeForcesReducedOnDeviceEvent(AtomLocality atomLocality)
+{
+    impl_->consumeForcesReducedOnDeviceEvent(atomLocality);
+}
+
 GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality atomLocality)
 {
     return impl_->fReadyOnDevice(atomLocality);