Rework GPU halo and state propagator streams and dependencies to get better overlap

author Alan Gray <alangray3@gmail.com>

Wed, 6 Oct 2021 09:26:43 +0000 (09:26 +0000)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 6 Oct 2021 09:26:43 +0000 (09:26 +0000)
author Alan Gray <alangray3@gmail.com>
Wed, 6 Oct 2021 09:26:43 +0000 (09:26 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 6 Oct 2021 09:26:43 +0000 (09:26 +0000)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index 8a634b7c07594c99bb43fb46e62557beaeacab4e..f063f3b0e75cc38a6edb6364e4cd1c1fd6d3680e 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -3205,14 +3205,7 @@ void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
          for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
          {
              cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
-                    cr.dd,
-                    d,
-                    cr.mpi_comm_mygroup,
-                    deviceStreamManager.context(),
-                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
-                    deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
-                    pulse,
-                    wcycle));
+                    cr.dd, d, cr.mpi_comm_mygroup, deviceStreamManager.context(), pulse, wcycle));
          }
      }
  }
@@ -3230,26 +3223,31 @@ void reinitGpuHaloExchange(const t_commrec&              cr,
      }
  }
  
-void communicateGpuHaloCoordinates(const t_commrec&      cr,
-                                   const matrix          box,
-                                   GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+GpuEventSynchronizer* communicateGpuHaloCoordinates(const t_commrec&      cr,
+                                                    const matrix          box,
+                                                    GpuEventSynchronizer* dependencyEvent)
  {
+    GpuEventSynchronizer* eventPtr = dependencyEvent;
      for (int d = 0; d < cr.dd->ndim; d++)
      {
          for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
          {
-            cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+            eventPtr = cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, eventPtr);
          }
      }
+    return eventPtr;
  }
  
-void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
+void communicateGpuHaloForces(const t_commrec&                                    cr,
+                              bool                                                accumulateForces,
+                              gmx::FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents)
  {
      for (int d = cr.dd->ndim - 1; d >= 0; d--)
      {
          for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
          {
-            cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
+            cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces, dependencyEvents);
+            dependencyEvents->push_back(cr.dd->gpuHaloExchange[d][pulse]->getForcesReadyOnDeviceEvent());
          }
      }
  }
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h

index bb964de98eba6b4cd6cd61933e4cd92542c7693d..c7d7483cbec6c7ae68d8d6edeacc542223cba379 100644 (file)
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -92,6 +92,8 @@ class RangePartitioning;
  class VirtualSitesHandler;
  template<typename>
  class ArrayRef;
+template<typename, size_t>
+class FixedCapacityVector;
  } // namespace gmx
  
  /*! \brief Returns the global topology atom number belonging to local atom index i.
@@ -265,20 +267,24 @@ void reinitGpuHaloExchange(const t_commrec&        cr,
  
  
  /*! \brief GPU halo exchange of coordinates buffer.
- * \param [in] cr                             The commrec object
- * \param [in] box                            Coordinate box (from which shifts will be constructed)
- * \param [in] coordinatesReadyOnDeviceEvent  event recorded when coordinates have been copied to device
+ * \param [in] cr                The commrec object
+ * \param [in] box               Coordinate box (from which shifts will be constructed)
+ * \param [in] dependencyEvent   Dependency event for this operation
+ * \returns                      Event recorded when this operation has been launched
   */
-void communicateGpuHaloCoordinates(const t_commrec&      cr,
-                                   const matrix          box,
-                                   GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
-
+GpuEventSynchronizer* communicateGpuHaloCoordinates(const t_commrec&      cr,
+                                                    const matrix          box,
+                                                    GpuEventSynchronizer* dependencyEvent);
  
-/*! \brief GPU halo exchange of force buffer.
- * \param [in] cr                The commrec object
+/*! \brief  Wait for copy of nonlocal part of coordinate array from GPU to CPU
+ * following coordinate halo exchange
+ * \param [in] cr   The commrec object
   * \param [in] accumulateForces  True if forces should accumulate, otherwise they are set
+ * \param [in] dependencyEvents  Dependency events for this operation
   */
-void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces);
+void communicateGpuHaloForces(const t_commrec&                                    cr,
+                              bool                                                accumulateForces,
+                              gmx::FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents);
  
  /*! \brief Wraps the \c positions so that atoms from the same
   * update group share the same periodic image wrt \c box.
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h

index adc8d667127fe4f0c14361d8fd4f4c529e052ce1..431407b6bcae2139586d15ecebdcdc544e540b5c 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -46,7 +46,9 @@
  
  #include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/math/vectypes.h"
+#include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/fixedcapacityvector.h"
  #include "gromacs/utility/gmxmpi.h"
  
  struct gmx_domdec_t;
@@ -66,27 +68,26 @@ class GpuHaloExchange
  public:
      /*! \brief Creates GPU Halo Exchange object.
       *
-     * Coordinate Halo exchange will be performed in \c
-     * StreamNonLocal, and the \c communicateHaloCoordinates
-     * method must be called before any subsequent operations that
-     * access non-local parts of the coordinate buffer (such as
-     * the non-local non-bonded kernels). It also must be called
-     * after the local coordinates buffer operations (where the
-     * coordinates are copied to the device and hence the \c
-     * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange
-     * will be performed in \c streamNonLocal and the \c
-     * communicateHaloForces method must be called after the
-     * non-local buffer operations, after the local force buffer
-     * has been copied to the GPU (if CPU forces are present), and
-     * before the local buffer operations. The force halo exchange
-     * does not yet support virial steps.
+     * Coordinate Halo exchange will be performed in its own stream
+     * with appropriate event-based synchronization, and the \c
+     * communicateHaloCoordinates method must be called before any
+     * subsequent operations that access non-local parts of the
+     * coordinate buffer (such as the non-local non-bonded
+     * kernels). It also must be called after the local coordinates
+     * buffer operations (where the coordinates are copied to the
+     * device and hence the \c coordinatesReadyOnDeviceEvent is
+     * recorded). Force Halo exchange will also be performed in its
+     * own stream with appropriate event-based synchronization, and
+     * the \c communicateHaloForces method must be called after the
+     * non-local buffer operations, after the local force buffer has
+     * been copied to the GPU (if CPU forces are present), and before
+     * the local buffer operations. The force halo exchange does not
+     * yet support virial steps.
       *
       * \param [inout] dd                       domdec structure
       * \param [in]    dimIndex                 the dimension index for this instance
       * \param [in]    mpi_comm_mysim           communicator used for simulation
       * \param [in]    deviceContext            GPU device context
-     * \param [in]    streamLocal              local NB CUDA stream.
-     * \param [in]    streamNonLocal           non-local NB CUDA stream.
       * \param [in]    pulse                    the communication pulse for this instance
       * \param [in]    wcycle                   The wallclock counter
       */
@@ -94,8 +95,6 @@ public:
                      int                  dimIndex,
                      MPI_Comm             mpi_comm_mysim,
                      const DeviceContext& deviceContext,
-                    const DeviceStream&  streamLocal,
-                    const DeviceStream&  streamNonLocal,
                      int                  pulse,
                      gmx_wallcycle*       wcycle);
      ~GpuHaloExchange();
@@ -116,15 +115,18 @@ public:
       * Must be called after local setCoordinates (which records an
       * event when the coordinate data has been copied to the
       * device).
-     * \param [in] box  Coordinate box (from which shifts will be constructed)
-     * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+     * \param [in] box               Coordinate box (from which shifts will be constructed)
+     * \param [in] dependencyEvent   Dependency event for this operation
+     * \returns                      Event recorded when this operation has been launched
       */
-    void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+    GpuEventSynchronizer* communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* dependencyEvent);
  
      /*! \brief GPU halo exchange of force buffer.
       * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
+     * \param[in] dependencyEvents  Dependency events for this operation
       */
-    void communicateHaloForces(bool accumulateForces);
+    void communicateHaloForces(bool                                           accumulateForces,
+                               FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents);
  
      /*! \brief Get the event synchronizer for the forces ready on device.
       *  \returns  The event to synchronize the stream that consumes forces on device.
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp

index e00d4e3d606f96385ddc1b88a46f1fa2f588765e..5a839b27bfe84913bfd27bc19f83aa69ba0353c4 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -67,8 +67,6 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
                                   int /* dimIndex */,
                                   MPI_Comm /* mpi_comm_mysim */,
                                   const DeviceContext& /* deviceContext */,
-                                 const DeviceStream& /*streamLocal */,
-                                 const DeviceStream& /*streamNonLocal */,
                                   int /*pulse */,
                                   gmx_wallcycle* /*wcycle*/) :
      impl_(nullptr)
@@ -96,16 +94,18 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> /* d_coordinatesBuffer */,
  }
  
  /*!\brief apply X halo exchange stub. */
-void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
-                                                 GpuEventSynchronizer* /*coordinatesOnDeviceEvent*/)
+GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
+                                                                  GpuEventSynchronizer* /*dependencyEvent*/)
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub for GPU Halo Exchange exchange was called insted of the correct "
                 "implementation.");
+    return nullptr;
  }
  
  /*!\brief apply F halo exchange stub. */
-void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
+void GpuHaloExchange::communicateHaloForces(bool /* accumulateForces */,
+                                            FixedCapacityVector<GpuEventSynchronizer*, 2>* /*dependencyEvents*/)
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu

index 5796b96980873f3f4036388d021aab7d9e6b2ab3..b0dab24e8a978a689a5d44dbb9f3e7d213675512 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -200,7 +200,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
          std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
  
          copyToDeviceBuffer(
-                &d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr);
+                &d_indexMap_, h_indexMap_.data(), 0, newSize, *haloStream_, GpuApiCallBehavior::Async, nullptr);
      }
  
  #if GMX_MPI
@@ -270,19 +270,16 @@ void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynch
                   0,
                   mpi_comm_mysim_,
                   MPI_STATUS_IGNORE);
-    remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+    remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(*haloStream_);
  }
  
-void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box,
-                                                       GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+GpuEventSynchronizer* GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box,
+                                                                        GpuEventSynchronizer* dependencyEvent)
  {
-
      wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu);
-    if (pulse_ == 0)
-    {
-        // ensure stream waits until coordinate data is available on device
-        coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
-    }
+
+    // ensure stream waits until dependency has been satisfied
+    dependencyEvent->enqueueWaitEvent(*haloStream_);
  
      wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
  
@@ -318,8 +315,7 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
          const auto kernelArgs = prepareGpuKernelArguments(
                  kernelFn, config, &sendBuf, &d_x, &indexMap, &size, &coordinateShift);
  
-        launchGpuKernel(
-                kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
+        launchGpuKernel(kernelFn, config, *haloStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
      }
  
      wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
@@ -331,28 +327,41 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
  
      // wait for remote co-ordinates is implicit with process-MPI as non-local stream is synchronized before MPI calls
      // and MPI_Waitall call makes sure both neighboring ranks' non-local stream is synchronized before data transfer is initiated
-    if (GMX_THREAD_MPI && pulse_ == 0)
+    if (GMX_THREAD_MPI && dimIndex_ == 0 && pulse_ == 0)
      {
-        enqueueWaitRemoteCoordinatesReadyEvent(coordinatesReadyOnDeviceEvent);
+        enqueueWaitRemoteCoordinatesReadyEvent(dependencyEvent);
      }
  
      float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_];
      communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);
  
+    coordinateHaloLaunched_.markEvent(*haloStream_);
+
      wallcycle_stop(wcycle_, WallCycleCounter::MoveX);
+
+    return &coordinateHaloLaunched_;
  }
  
  // The following method should be called after non-local buffer operations,
-// and before the local buffer operations. It operates in the non-local stream.
-void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
+// and before the local buffer operations.
+void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces,
+                                                  FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents)
  {
+
      // Consider time spent in communicateHaloData as Comm.F counter
      // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
      wallcycle_start(wcycle_, WallCycleCounter::MoveF);
  
+    while (dependencyEvents->size() > 0)
+    {
+        auto dependency = dependencyEvents->back();
+        dependency->enqueueWaitEvent(*haloStream_);
+        dependencyEvents->pop_back();
+    }
+
      float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_;
  
-    // Communicate halo data (in non-local stream)
+    // Communicate halo data
      communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_);
  
      wallcycle_stop(wcycle_, WallCycleCounter::MoveF);
@@ -361,19 +370,6 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
      wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
  
      float3* d_f = d_f_;
-    // If this is the last pulse and index (noting the force halo
-    // exchanges across multiple pulses and indices are called in
-    // reverse order) then perform the following preparation
-    // activities
-    if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
-    {
-        // ensure non-local stream waits for local stream, due to dependence on
-        // the previous H2D copy of CPU forces (if accumulateForces is true)
-        // or local force clearing.
-        GpuEventSynchronizer eventLocal;
-        eventLocal.markEvent(localStream_);
-        eventLocal.enqueueWaitEvent(nonLocalStream_);
-    }
  
      // Unpack halo buffer into force array
  
@@ -405,14 +401,10 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
          const auto kernelArgs =
                  prepareGpuKernelArguments(kernelFn, config, &d_f, &recvBuf, &indexMap, &size);
  
-        launchGpuKernel(
-                kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
+        launchGpuKernel(kernelFn, config, *haloStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
      }
  
-    if (pulse_ == 0)
-    {
-        fReadyOnDevice_.markEvent(nonLocalStream_);
-    }
+    fReadyOnDevice_.markEvent(*haloStream_);
  
      wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
      wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
@@ -447,12 +439,12 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaMPI(float3* sendPtr,
      // no need to wait for haloDataReadyOnDevice event if this rank is not sending any data
      if (sendSize > 0)
      {
-        // wait for non local stream to complete all outstanding
+        // wait for halo stream to complete all outstanding
          // activities, to ensure that buffer is up-to-date in GPU memory
          // before transferring to remote rank
  
          // ToDo: Replace stream synchronize with event synchronize
-        nonLocalStream_.synchronize();
+        haloStream_->synchronize();
      }
  
      // perform halo exchange directly in device buffers
@@ -491,7 +483,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
                                 sendPtr,
                                 sendSize * DIM * sizeof(float),
                                 cudaMemcpyDeviceToDevice,
-                               nonLocalStream_.stream());
+                               haloStream_->stream());
  
          CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
      }
@@ -506,7 +498,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
      GMX_ASSERT(haloDataTransferLaunched_ != nullptr,
                 "Halo exchange requires valid event to synchronize data transfer initiated in "
                 "remote rank");
-    haloDataTransferLaunched_->markEvent(nonLocalStream_);
+    haloDataTransferLaunched_->markEvent(*haloStream_);
  
      MPI_Sendrecv(&haloDataTransferLaunched_,
                   sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
@@ -521,7 +513,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr,
                   mpi_comm_mysim_,
                   MPI_STATUS_IGNORE);
  
-    haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+    haloDataTransferRemote->enqueueWaitEvent(*haloStream_);
  #else
      GMX_UNUSED_VALUE(sendRank);
      GMX_UNUSED_VALUE(recvRank);
@@ -538,8 +530,6 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
                              int                  dimIndex,
                              MPI_Comm             mpi_comm_mysim,
                              const DeviceContext& deviceContext,
-                            const DeviceStream&  localStream,
-                            const DeviceStream&  nonLocalStream,
                              int                  pulse,
                              gmx_wallcycle*       wcycle) :
      dd_(dd),
@@ -551,8 +541,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
      haloDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr),
      mpi_comm_mysim_(mpi_comm_mysim),
      deviceContext_(deviceContext),
-    localStream_(localStream),
-    nonLocalStream_(nonLocalStream),
+    haloStream_(new DeviceStream(deviceContext, DeviceStreamPriority::High, false)),
      dimIndex_(dimIndex),
      pulse_(pulse),
      wcycle_(wcycle)
@@ -580,11 +569,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
                                   int                  dimIndex,
                                   MPI_Comm             mpi_comm_mysim,
                                   const DeviceContext& deviceContext,
-                                 const DeviceStream&  localStream,
-                                 const DeviceStream&  nonLocalStream,
                                   int                  pulse,
                                   gmx_wallcycle*       wcycle) :
-    impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
+    impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, pulse, wcycle))
  {
  }
  
@@ -603,15 +590,16 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer<RVec> d_coordinatesBuffer, DeviceB
      impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
  }
  
-void GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
-                                                 GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix          box,
+                                                                  GpuEventSynchronizer* dependencyEvent)
  {
-    impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+    return impl_->communicateHaloCoordinates(box, dependencyEvent);
  }
  
-void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
+void GpuHaloExchange::communicateHaloForces(bool accumulateForces,
+                                            FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents)
  {
-    impl_->communicateHaloForces(accumulateForces);
+    impl_->communicateHaloForces(accumulateForces, dependencyEvents);
  }
  
  GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh

index 6190a56b9584d0b575a78cb5a14931b265680108..c29834616a5ab4153aff0cf501467311ea8c7ee4 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -75,8 +75,6 @@ public:
       * \param [in]    dimIndex                 the dimension index for this instance
       * \param [in]    mpi_comm_mysim           communicator used for simulation
       * \param [in]    deviceContext            GPU device context
-     * \param [in]    localStream              local NB CUDA stream
-     * \param [in]    nonLocalStream           non-local NB CUDA stream
       * \param [in]    pulse                    the communication pulse for this instance
       * \param [in]    wcycle                   The wallclock counter
       */
@@ -84,8 +82,6 @@ public:
           int                  dimIndex,
           MPI_Comm             mpi_comm_mysim,
           const DeviceContext& deviceContext,
-         const DeviceStream&  localStream,
-         const DeviceStream&  nonLocalStream,
           int                  pulse,
           gmx_wallcycle*       wcycle);
      ~Impl();
@@ -101,14 +97,17 @@ public:
      /*! \brief
       * GPU halo exchange of coordinates buffer
       * \param [in] box  Coordinate box (from which shifts will be constructed)
-     * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+     * \param [in] dependencyEvent   Dependency event for this operation
+     * \returns                      Event recorded when this operation has been launched
       */
-    void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+    GpuEventSynchronizer* communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* dependencyEvent);
  
      /*! \brief  GPU halo exchange of force buffer
-     * \param[in] accumulateForces  True if forces should accumulate, otherwise they are set
+     * \param [in] accumulateForces  True if forces should accumulate, otherwise they are set
+     * \param [in] dependencyEvents  Dependency events for this operation
       */
-    void communicateHaloForces(bool accumulateForces);
+    void communicateHaloForces(bool                                           accumulateForces,
+                               FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents);
  
      /*! \brief Get the event synchronizer for the forces ready on device.
       *  \returns  The event to synchronize the stream that consumes forces on device.
@@ -150,8 +149,8 @@ private:
                                          int     recvSize,
                                          int     recvRank);
  
-    /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in non-local
-     * stream \param [in] eventSync    event recorded when coordinates/forces are ready to device
+    /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in halo stream
+     * \param [in] eventSync    event recorded when coordinates/forces are ready to device
       */
      void enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
  
@@ -211,10 +210,8 @@ private:
      MPI_Comm mpi_comm_mysim_;
      //! GPU context object
      const DeviceContext& deviceContext_;
-    //! CUDA stream for local non-bonded calculations
-    const DeviceStream& localStream_;
-    //! CUDA stream for non-local non-bonded calculations
-    const DeviceStream& nonLocalStream_;
+    //! CUDA stream for this halo exchange
+    DeviceStream* haloStream_;
      //! full coordinates buffer in GPU memory
      float3* d_x_ = nullptr;
      //! full forces buffer in GPU memory
@@ -229,6 +226,8 @@ private:
      gmx_wallcycle* wcycle_ = nullptr;
      //! The atom offset for receive (x) or send (f) for dimension index and pulse corresponding to this halo exchange instance
      int atomOffset_ = 0;
+    //! Event triggered when coordinate halo has been launched
+    GpuEventSynchronizer coordinateHaloLaunched_;
  };
  
  } // namespace gmx
diff --git a/src/gromacs/domdec/tests/haloexchange_mpi.cpp b/src/gromacs/domdec/tests/haloexchange_mpi.cpp

index 1f1138fb36d5e41f910df13e0fa3fb63a43058d9..63659998efe79bd3fb26e9ea20cf628a09623584 100644 (file)
--- a/src/gromacs/domdec/tests/haloexchange_mpi.cpp
+++ b/src/gromacs/domdec/tests/haloexchange_mpi.cpp
@@ -150,8 +150,8 @@ void gpuHalo(gmx_domdec_t* dd, matrix box, HostVector<RVec>* h_x, int numAtomsTo
      {
          for (int pulse = 0; pulse < dd->comm->cd[d].numPulses(); pulse++)
          {
-            gpuHaloExchange[d].push_back(GpuHaloExchange(
-                    dd, d, MPI_COMM_WORLD, deviceContext, deviceStream, deviceStream, pulse, nullptr));
+            gpuHaloExchange[d].push_back(
+                    GpuHaloExchange(dd, d, MPI_COMM_WORLD, deviceContext, pulse, nullptr));
          }
      }
  
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index ee3e8cdd46fb086e0db83a7fe9ce9ee3dd2a49ba..241cc3b9fc3036f0267a7439981ce462c65aa328 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1132,12 +1132,13 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
      const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
                              || runScheduleWork->simulationWork.havePpDomainDecomposition;
      const int atomStart = 0;
-    fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
-                                                            nbv->getNumAtoms(AtomLocality::Local),
-                                                            nbv->getGridIndices(),
-                                                            atomStart,
-                                                            accumulate,
-                                                            stateGpu->fReducedOnDevice());
+    fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
+            stateGpu->getForces(),
+            nbv->getNumAtoms(AtomLocality::Local),
+            nbv->getGridIndices(),
+            atomStart,
+            accumulate,
+            stateGpu->fReducedOnDevice(AtomLocality::Local));
  
      // register forces and add dependencies
      fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
@@ -1174,15 +1175,12 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
          }
      }
  
-    if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
+    if (runScheduleWork->domainWork.haveCpuLocalForceWork
+        || (runScheduleWork->simulationWork.havePpDomainDecomposition
+            && !runScheduleWork->simulationWork.useGpuHaloExchange))
      {
-        // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
-        if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
-        {
-            const bool useGpuForceBufferOps = true;
-            fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
-                    stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::All, useGpuForceBufferOps));
-        }
+        fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
+                stateGpu->fReadyOnDevice(AtomLocality::Local));
      }
  
      if (runScheduleWork->simulationWork.useGpuHaloExchange)
@@ -1197,16 +1195,23 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
          const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
                                  || runScheduleWork->domainWork.haveFreeEnergyWork;
          const int atomStart = dd_numHomeAtoms(*cr->dd);
-        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(stateGpu->getForces(),
-                                                                   nbv->getNumAtoms(AtomLocality::NonLocal),
-                                                                   nbv->getGridIndices(),
-                                                                   atomStart,
-                                                                   accumulate);
+        fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
+                stateGpu->getForces(),
+                nbv->getNumAtoms(AtomLocality::NonLocal),
+                nbv->getGridIndices(),
+                atomStart,
+                accumulate,
+                stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
  
          // register forces and add dependencies
-        // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
          fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(
                  Nbnxm::gpu_get_f(nbv->gpu_nbv));
+
+        if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
+        {
+            fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
+                    stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
+        }
      }
  }
  
@@ -1266,6 +1271,17 @@ void do_force(FILE*                               fplog,
      runScheduleWork->stepWork = setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, simulationWork);
      const StepWorkload& stepWork = runScheduleWork->stepWork;
  
+    if (stepWork.useGpuFHalo && !runScheduleWork->domainWork.haveCpuLocalForceWork)
+    {
+        // GPU Force halo exchange will set a subset of local atoms with remote non-local data
+        // First clear local portion of force array, so that untouched atoms are zero.
+        // The dependency for this is that forces from previous timestep have been consumed,
+        // which is satisfied when getCoordinatesReadyOnDeviceEvent has been marked.
+        stateGpu->clearForcesOnGpu(AtomLocality::Local,
+                                   stateGpu->getCoordinatesReadyOnDeviceEvent(
+                                           AtomLocality::Local, simulationWork, stepWork));
+    }
+
      /* At a search step we need to start the first balancing region
       * somewhere early inside the step after communication during domain
       * decomposition (and not during the previous step as usual).
@@ -1579,16 +1595,18 @@ void do_force(FILE*                               fplog,
          }
          else
          {
+            GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
              if (stepWork.useGpuXHalo)
              {
                  // The following must be called after local setCoordinates (which records an event
                  // when the coordinate data has been copied to the device).
-                communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
+                gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
  
                  if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                  {
                      // non-local part of coordinate buffer must be copied back to host for CPU work
-                    stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
+                    stateGpu->copyCoordinatesFromGpu(
+                            x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
                  }
              }
              else
@@ -1608,10 +1626,11 @@ void do_force(FILE*                               fplog,
                  {
                      stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
                  }
-                nbv->convertCoordinatesGpu(AtomLocality::NonLocal,
-                                           stateGpu->getCoordinates(),
-                                           stateGpu->getCoordinatesReadyOnDeviceEvent(
-                                                   AtomLocality::NonLocal, simulationWork, stepWork));
+                nbv->convertCoordinatesGpu(
+                        AtomLocality::NonLocal,
+                        stateGpu->getCoordinates(),
+                        stateGpu->getCoordinatesReadyOnDeviceEvent(
+                                AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched));
              }
              else
              {
@@ -2085,13 +2104,11 @@ void do_force(FILE*                               fplog,
              {
                  // If there exist CPU forces, data from halo exchange should accumulate into these
                  bool accumulateForces = domainWork.haveCpuLocalForceWork;
-                if (!accumulateForces)
-                {
-                    // Force halo exchange will set a subset of local atoms with remote non-local data
-                    // First clear local portion of force array, so that untouched atoms are zero
-                    stateGpu->clearForcesOnGpu(AtomLocality::Local);
-                }
-                communicateGpuHaloForces(*cr, accumulateForces);
+                gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
+                gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
+                gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+
+                communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
              }
              else
              {
@@ -2232,15 +2249,7 @@ void do_force(FILE*                               fplog,
              //   These should be unified.
              if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
              {
-                // Note: AtomLocality::All is used for the non-DD case because, as in this
-                // case copyForcesToGpu() uses a separate stream, it allows overlap of
-                // CPU force H2D with GPU force tasks on all streams including those in the
-                // local stream which would otherwise be implicit dependencies for the
-                // transfer and would not overlap.
-                auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
-                                                                         : AtomLocality::All;
-
-                stateGpu->copyForcesToGpu(forceWithShift, locality);
+                stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
              }
  
              if (stepWork.computeNonbondedForces)
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 970bcd6781830a386e2e6e14424c7674b8bee7b9..85c29398e012f82c5ae52164123238d7b92137ea 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -1522,18 +1522,17 @@ void gmx::LegacySimulator::do_md()
                           && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
  
                  // This applies Leap-Frog, LINCS and SETTLE in succession
-                integrator->integrate(
-                        stateGpu->getForcesReadyOnDeviceEvent(
-                                AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
-                        ir->delta_t,
-                        true,
-                        bCalcVir,
-                        shake_vir,
-                        doTemperatureScaling,
-                        ekind->tcstat,
-                        doParrinelloRahman,
-                        ir->nstpcouple * ir->delta_t,
-                        M);
+                integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
+                                              runScheduleWork->stepWork, runScheduleWork->simulationWork),
+                                      ir->delta_t,
+                                      true,
+                                      bCalcVir,
+                                      shake_vir,
+                                      doTemperatureScaling,
+                                      ekind->tcstat,
+                                      doParrinelloRahman,
+                                      ir->nstpcouple * ir->delta_t,
+                                      M);
  
                  // Copy velocities D2H after update if:
                  // - Globals are computed this step (includes the energy output steps).
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h

index 9e8dc576a6185fbef5f0830c346dac29a604f4e5..ad07df2c132236c3763e2b7ce22d14f76506e454 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h
@@ -188,15 +188,17 @@ public:
       * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
       * returned synchronizer indicates that the copy is complete.
       *
-     *  \param[in] atomLocality    Locality of the particles to wait for.
-     *  \param[in] simulationWork  The simulation lifetime flags.
-     *  \param[in] stepWork        The step lifetime flags.
+     *  \param[in] atomLocality              Locality of the particles to wait for.
+     *  \param[in] simulationWork            The simulation lifetime flags.
+     *  \param[in] stepWork                  The step lifetime flags.
+     *  \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched.
       *
       *  \returns  The event to synchronize the stream that consumes coordinates on device.
       */
      GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
                                                             const SimulationWorkload& simulationWork,
-                                                           const StepWorkload&       stepWork);
+                                                           const StepWorkload&       stepWork,
+                                                           GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr);
  
      /*! \brief Blocking wait until coordinates are copied to the device.
       *
@@ -212,12 +214,15 @@ public:
       */
      void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent);
  
-    /*! \brief Copy positions from the GPU memory.
+    /*! \brief Copy positions from the GPU memory, with an optional explicit dependency.
       *
       *  \param[in] h_x           Positions buffer in the host memory.
       *  \param[in] atomLocality  Locality of the particles to copy.
+     *  \param[in] dependency    Dependency event for this operation.
       */
-    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality);
+    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+                                AtomLocality             atomLocality,
+                                GpuEventSynchronizer*    dependency = nullptr);
  
      /*! \brief Wait until coordinates are available on the host.
       *
@@ -271,8 +276,9 @@ public:
      /*! \brief Clear forces in the GPU memory.
       *
       *  \param[in] atomLocality  Locality of the particles to clear.
+     *  \param[in] dependency    Dependency event for this operation.
       */
-    void clearForcesOnGpu(AtomLocality atomLocality);
+    void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency);
  
      /*! \brief Get the event synchronizer for the forces ready on device.
       *
@@ -281,20 +287,27 @@ public:
       *  1. The forces are copied to the device (when GPU buffer ops are off)
       *  2. The forces are reduced on the device (GPU buffer ops are on)
       *
-     *  \todo Pass step workload instead of the useGpuFBufferOps boolean.
-     *
-     *  \param[in] atomLocality      Locality of the particles to wait for.
-     *  \param[in] useGpuFBufferOps  If the force buffer ops are offloaded to the GPU.
+     *  \param[in] stepWork        Step workload flags
+     *  \param[in] simulationWork  Simulation workload flags
       *
       *  \returns  The event to synchronize the stream that consumes forces on device.
       */
-    GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps);
+    GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload       stepWork,
+                                                           SimulationWorkload simulationWork);
  
      /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
       *
-     *  \returns  The event to mark when forces are reduced on the GPU.
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are reduced on the GPU.
+     */
+    GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+
+    /*! \brief Getter for the event synchronizer for the forces are ready on the GPU.
+     *
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are ready on the GPU.
       */
-    GpuEventSynchronizer* fReducedOnDevice();
+    GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality);
  
      /*! \brief Copy forces from the GPU memory.
       *
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp

index d4fc4887209c1048df40078c9bad77f5a774c5f0..618856d0ce1a6fb650ce9cc5cdcd2c6f18cd9894 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp
@@ -103,7 +103,8 @@ DeviceBuffer<RVec> StatePropagatorDataGpu::getCoordinates()
  GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(
          AtomLocality /* atomLocality */,
          const SimulationWorkload& /* simulationWork */,
-        const StepWorkload& /* stepWork       */)
+        const StepWorkload& /* stepWork       */,
+        GpuEventSynchronizer* /* gpuCoordinateHaloLaunched */)
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub method from GPU state propagator data was called instead of one from "
@@ -141,14 +142,14 @@ void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality /* atomLoca
  }
  
  void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> /* h_x          */,
-                                                    AtomLocality /* atomLocality */)
+                                                    AtomLocality /* atomLocality */,
+                                                    GpuEventSynchronizer* /*dependency */)
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub method from GPU state propagator data was called instead of one from "
                 "GPU implementation.");
  }
  
-
  DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
  {
      GMX_ASSERT(!impl_,
@@ -197,15 +198,24 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
                 "GPU implementation.");
  }
  
-void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality /* atomLocality */)
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality /* atomLocality */,
+                                              GpuEventSynchronizer* /* dependency */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub method from GPU state propagator data was called instead of one from "
+               "GPU implementation.");
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::getLocalForcesReadyOnDeviceEvent(StepWorkload /* stepWork */,
+                                                                               SimulationWorkload /* simulationWork */)
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub method from GPU state propagator data was called instead of one from "
                 "GPU implementation.");
+    return nullptr;
  }
  
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality /* atomLocality */,
-                                                                          bool /* useGpuFBufferOps */)
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality /*atomLocality*/)
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub method from GPU state propagator data was called instead of one from "
@@ -213,7 +223,7 @@ GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLo
      return nullptr;
  }
  
-GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
+GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality /*atomLocality*/)
  {
      GMX_ASSERT(!impl_,
                 "A CPU stub method from GPU state propagator data was called instead of one from "
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h

index 4353eecf4ecf278d25ef632b8230b8cd45b03715..dbe435d508ddd13c54aeec0eeb7c21d26cae7784 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.h
@@ -175,12 +175,14 @@ public:
       *  \param[in] atomLocality    Locality of the particles to wait for.
       *  \param[in] simulationWork  The simulation lifetime flags.
       *  \param[in] stepWork        The step lifetime flags.
+     *  \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched.
       *
       *  \returns  The event to synchronize the stream that consumes coordinates on device.
       */
      GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
                                                             const SimulationWorkload& simulationWork,
-                                                           const StepWorkload&       stepWork);
+                                                           const StepWorkload&       stepWork,
+                                                           GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr);
  
      /*! \brief Blocking wait until coordinates are copied to the device.
       *
@@ -196,12 +198,15 @@ public:
       */
      void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent);
  
-    /*! \brief Copy positions from the GPU memory.
+    /*! \brief Copy positions from the GPU memory, with an optional explicit dependency.
       *
       *  \param[in] h_x           Positions buffer in the host memory.
       *  \param[in] atomLocality  Locality of the particles to copy.
+     *  \param[in] dependency    Dependency event for this operation.
       */
-    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality);
+    void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+                                AtomLocality             atomLocality,
+                                GpuEventSynchronizer*    dependency = nullptr);
  
      /*! \brief Wait until coordinates are available on the host.
       *
@@ -253,8 +258,9 @@ public:
      /*! \brief Clear forces in the GPU memory.
       *
       *  \param[in] atomLocality  Locality of the particles to clear.
+     *  \param[in] dependency    Dependency event for this operation.
       */
-    void clearForcesOnGpu(AtomLocality atomLocality);
+    void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency);
  
      /*! \brief Get the event synchronizer for the forces ready on device.
       *
@@ -263,20 +269,27 @@ public:
       *  1. The forces are copied to the device (when GPU buffer ops are off)
       *  2. The forces are reduced on the device (GPU buffer ops are on)
       *
-     *  \todo Pass step workload instead of the useGpuFBufferOps boolean.
-     *
-     *  \param[in] atomLocality      Locality of the particles to wait for.
-     *  \param[in] useGpuFBufferOps  If the force buffer ops are offloaded to the GPU.
+     *  \param[in] stepWork        Step workload flags
+     *  \param[in] simulationWork  Simulation workload flags
       *
       *  \returns  The event to synchronize the stream that consumes forces on device.
       */
-    GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps);
+    GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload       stepWork,
+                                                           SimulationWorkload simulationWork);
  
-    /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
+    /*! \brief Getter for the event synchronizer for when forces are reduced on the GPU.
       *
-     *  \returns  The event to mark when forces are reduced on the GPU.
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are reduced on the GPU.
+     */
+    GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+
+    /*! \brief Getter for the event synchronizer for the forces are ready for GPU update.
+     *
+     *  \param[in] atomLocality      Locality of the particles to wait for.
+     *  \returns                     The event to mark when forces are ready for GPU update.
       */
-    GpuEventSynchronizer* fReducedOnDevice();
+    GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality);
  
      /*! \brief Copy forces from the GPU memory.
       *
@@ -327,6 +340,9 @@ private:
      EnumerationArray<AtomLocality, const DeviceStream*> vCopyStreams_ = { { nullptr } };
      // Streams to use for forces H2D and D2H copies (one event for each atom locality)
      EnumerationArray<AtomLocality, const DeviceStream*> fCopyStreams_ = { { nullptr } };
+    // Streams internal to this module
+    std::unique_ptr<DeviceStream> copyInStream_;
+    std::unique_ptr<DeviceStream> memsetStream_;
  
      /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality)
       *
@@ -343,8 +359,8 @@ private:
  
      //! An array of events that indicate H2D copy of forces is complete (one event for each atom locality)
      EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnDevice_;
-    //! An event that the forces were reduced on the GPU
-    GpuEventSynchronizer fReducedOnDevice_;
+    //! An array of events that indicate the forces were reduced on the GPU (one event for each atom locality)
+    EnumerationArray<AtomLocality, GpuEventSynchronizer> fReducedOnDevice_;
      //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
      EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
  
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp

index aa63099ffa8b3a3f1387a604dccb6638b054671c..30478ef046c20b76599195554696c9f363e9808e 100644 (file)
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -95,6 +95,9 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManage
      fCopyStreams_[AtomLocality::Local]    = localStream_;
      fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
      fCopyStreams_[AtomLocality::All]      = updateStream_;
+
+    copyInStream_ = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::Normal, false);
+    memsetStream_ = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::Normal, false);
  }
  
  StatePropagatorDataGpu::Impl::Impl(const DeviceStream*  pmeStream,
@@ -340,23 +343,28 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
      wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
  }
  
-GpuEventSynchronizer*
-StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
-                                                               const SimulationWorkload& simulationWork,
-                                                               const StepWorkload&       stepWork)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(
+        AtomLocality              atomLocality,
+        const SimulationWorkload& simulationWork,
+        const StepWorkload&       stepWork,
+        GpuEventSynchronizer*     gpuCoordinateHaloLaunched)
  {
      // The provider of the coordinates may be different for local atoms. If the update is offloaded
      // and this is not a neighbor search step, then the consumer needs to wait for the update
      // to complete. Otherwise, the coordinates are copied from the host and we need to wait for
-    // the copy event. Non-local coordinates are always provided by the H2D copy.
-    //
-    // TODO: This should be reconsidered to support the halo exchange.
+    // the copy event. Non-local coordinates are provided by the GPU halo exchange (if active), otherwise by H2D copy.
      //
      // In OpenCL no events are used as coordinate sync is not necessary
      if (GMX_GPU_OPENCL)
      {
          return nullptr;
      }
+    if (atomLocality == AtomLocality::NonLocal && stepWork.useGpuXHalo)
+    {
+        GMX_ASSERT(gpuCoordinateHaloLaunched != nullptr,
+                   "GPU halo exchange is active but its completion event is null.");
+        return gpuCoordinateHaloLaunched;
+    }
      if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
      {
          GMX_ASSERT(xUpdatedOnDeviceEvent_ != nullptr, "The event synchronizer can not be nullptr.");
@@ -382,7 +390,9 @@ void StatePropagatorDataGpu::Impl::setXUpdatedOnDeviceEvent(GpuEventSynchronizer
      xUpdatedOnDeviceEvent_ = xUpdatedOnDeviceEvent;
  }
  
-void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
+void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+                                                          AtomLocality             atomLocality,
+                                                          GpuEventSynchronizer*    dependency)
  {
      GMX_ASSERT(atomLocality < AtomLocality::All,
                 formatString("Wrong atom locality. Only Local and NonLocal are allowed for "
@@ -393,6 +403,11 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVe
      GMX_ASSERT(deviceStream != nullptr,
                 "No stream is valid for copying positions with given atom locality.");
  
+    if (dependency != nullptr)
+    {
+        dependency->enqueueWaitEvent(*deviceStream);
+    }
+
      wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
      wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
  
@@ -476,11 +491,13 @@ DeviceBuffer<RVec> StatePropagatorDataGpu::Impl::getForces()
      return d_f_;
  }
  
+// Copy CPU forces to GPU using stream internal to this module to allow overlap
+// with GPU force calculations.
  void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
                                                     AtomLocality atomLocality)
  {
      GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+    DeviceStream* deviceStream = copyInStream_.get();
      GMX_ASSERT(deviceStream != nullptr,
                 "No stream is valid for copying forces with given atom locality.");
  
@@ -494,10 +511,14 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
      wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
  }
  
-void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
+void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency)
  {
      GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
-    const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+    DeviceStream* deviceStream = memsetStream_.get();
+
+    GMX_ASSERT(dependency != nullptr, "Dependency is not valid for clearing forces.");
+    dependency->enqueueWaitEvent(*deviceStream);
+
      GMX_ASSERT(deviceStream != nullptr,
                 "No stream is valid for clearing forces with given atom locality.");
  
@@ -506,26 +527,33 @@ void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
  
      clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
  
+    fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
+
      wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
      wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
  }
  
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
-                                                                                bool useGpuFBufferOps)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+                                                                                     SimulationWorkload simulationWork)
  {
-    if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps)
+    if (stepWork.useGpuFBufferOps && !simulationWork.useCpuPmePpCommunication)
      {
-        return &fReducedOnDevice_;
+        return &fReducedOnDevice_[AtomLocality::Local];
      }
      else
      {
-        return &fReadyOnDevice_[atomLocality];
+        return &fReadyOnDevice_[AtomLocality::Local];
      }
  }
  
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice(AtomLocality atomLocality)
+{
+    return &fReducedOnDevice_[atomLocality];
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReadyOnDevice(AtomLocality atomLocality)
  {
-    return &fReducedOnDevice_;
+    return &fReadyOnDevice_[atomLocality];
  }
  
  void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
@@ -617,9 +645,11 @@ void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRef<const gmx:
  GpuEventSynchronizer*
  StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality              atomLocality,
                                                           const SimulationWorkload& simulationWork,
-                                                         const StepWorkload&       stepWork)
+                                                         const StepWorkload&       stepWork,
+                                                         GpuEventSynchronizer* gpuCoordinateHaloLaunched)
  {
-    return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
+    return impl_->getCoordinatesReadyOnDeviceEvent(
+            atomLocality, simulationWork, stepWork, gpuCoordinateHaloLaunched);
  }
  
  void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
@@ -632,9 +662,11 @@ void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpd
      impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent);
  }
  
-void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x, AtomLocality atomLocality)
+void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec>   h_x,
+                                                    AtomLocality          atomLocality,
+                                                    GpuEventSynchronizer* dependency)
  {
-    return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
+    return impl_->copyCoordinatesFromGpu(h_x, atomLocality, dependency);
  }
  
  void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
@@ -675,20 +707,25 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec
      return impl_->copyForcesToGpu(h_f, atomLocality);
  }
  
-void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality)
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency)
+{
+    return impl_->clearForcesOnGpu(atomLocality, dependency);
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+                                                                               SimulationWorkload simulationWork)
  {
-    return impl_->clearForcesOnGpu(atomLocality);
+    return impl_->getLocalForcesReadyOnDeviceEvent(stepWork, simulationWork);
  }
  
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
-                                                                          bool useGpuFBufferOps)
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality atomLocality)
  {
-    return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps);
+    return impl_->fReducedOnDevice(atomLocality);
  }
  
-GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
+GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality atomLocality)
  {
-    return impl_->fReducedOnDevice();
+    return impl_->fReadyOnDevice(atomLocality);
  }
  
  void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec> h_f, AtomLocality atomLocality)
author	Alan Gray <alangray3@gmail.com>
	Wed, 6 Oct 2021 09:26:43 +0000 (09:26 +0000)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 6 Oct 2021 09:26:43 +0000 (09:26 +0000)
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/domdec/domdec.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cpp		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cu		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cuh		patch \| blob \| history
src/gromacs/domdec/tests/haloexchange_mpi.cpp		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl.h		patch \| blob \| history
src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp		patch \| blob \| history