Multiple pulses for GPU Halo Exchange

author Alan Gray <alangraygerrit@gmail.com>

Fri, 6 Dec 2019 13:19:43 +0000 (05:19 -0800)

committer Artem Zhmurov <zhmurov@gmail.com>

Sat, 15 Feb 2020 16:52:13 +0000 (17:52 +0100)
author Alan Gray <alangraygerrit@gmail.com>
Fri, 6 Dec 2019 13:19:43 +0000 (05:19 -0800)
committer Artem Zhmurov <zhmurov@gmail.com>
Sat, 15 Feb 2020 16:52:13 +0000 (17:52 +0100)
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index c625bdf7f9b792b22425e4d2849c16ea0e9a55d0..13b7a33e2b0b8170538e5f7251cf6d65f639db56 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -2949,17 +2949,13 @@ static bool canMake1DAnd1PulseDomainDecomposition(const DDSettings&
      return canMakeDDWith1DAnd1Pulse;
  }
  
-bool is1DAnd1PulseDD(const gmx_domdec_t& dd)
+bool is1D(const gmx_domdec_t& dd)
  {
      const int maxDimensionSize = std::max(dd.numCells[XX], std::max(dd.numCells[YY], dd.numCells[ZZ]));
      const int  productOfDimensionSizes      = dd.numCells[XX] * dd.numCells[YY] * dd.numCells[ZZ];
      const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes);
  
-    const bool hasMax1Pulse =
-            ((isDlbDisabled(dd.comm) && dd.comm->cellsize_limit >= dd.comm->systemInfo.cutoff)
-             || (!isDlbDisabled(dd.comm) && dd.comm->maxpulse == 1));
-
-    return decompositionHasOneDimension && hasMax1Pulse;
+    return decompositionHasOneDimension;
  }
  
  namespace gmx
@@ -3216,3 +3212,60 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const g
  
      return bCutoffAllowed;
  }
+
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, void* streamLocal, void* streamNonLocal)
+{
+
+    int gpuHaloExchangeSize = 0;
+    int pulseStart          = 0;
+    if (cr.dd->gpuHaloExchange.empty())
+    {
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendTextFormatted(
+                        "NOTE: Activating the 'GPU halo exchange' feature, enabled "
+                        "by the "
+                        "GMX_GPU_DD_COMMS environment variable.");
+    }
+    else
+    {
+        gpuHaloExchangeSize = static_cast<int>(cr.dd->gpuHaloExchange.size());
+        pulseStart          = gpuHaloExchangeSize - 1;
+    }
+    if (cr.dd->comm->cd[0].numPulses() > gpuHaloExchangeSize)
+    {
+        for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+        {
+            cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
+                    cr.dd, cr.mpi_comm_mysim, streamLocal, streamNonLocal, pulse));
+        }
+    }
+}
+
+void reinitGpuHaloExchange(const t_commrec&              cr,
+                           const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
+                           const DeviceBuffer<gmx::RVec> d_forcesBuffer)
+{
+    for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+    {
+        cr.dd->gpuHaloExchange[pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
+    }
+}
+
+void communicateGpuHaloCoordinates(const t_commrec&      cr,
+                                   const matrix          box,
+                                   GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+    for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+    {
+        cr.dd->gpuHaloExchange[pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+    }
+}
+
+void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
+{
+    for (int pulse = cr.dd->comm->cd[0].numPulses() - 1; pulse >= 0; pulse--)
+    {
+        cr.dd->gpuHaloExchange[pulse]->communicateHaloForces(accumulateForces);
+    }
+}
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h

index 439e86b5e7fb0fb608c2061894eb1866dcf50033..1f3fad44051cf9d9622bf60106e939720cb99346 100644 (file)
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -62,6 +62,7 @@
  
  #include <vector>
  
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
  #include "gromacs/math/vectypes.h"
  #include "gromacs/utility/arrayref.h"
  #include "gromacs/utility/basedefinitions.h"
@@ -84,6 +85,7 @@ struct t_nrnb;
  struct gmx_wallcycle;
  enum class PbcType : int;
  class t_state;
+class GpuEventSynchronizer;
  
  namespace gmx
  {
@@ -152,12 +154,12 @@ bool ddHaveSplitConstraints(const gmx_domdec_t& dd);
  /*! \brief Return whether update groups are used */
  bool ddUsesUpdateGroups(const gmx_domdec_t& dd);
  
-/*! \brief Return whether the DD has a single dimension with a single pulse
+/*! \brief Return whether the DD has a single dimension
   *
- * The GPU halo exchange code requires a 1D single-pulse DD, and its
- * setup code can use the returned value to understand what it should
- * do. */
-bool is1DAnd1PulseDD(const gmx_domdec_t& dd);
+ * The GPU halo exchange code requires a 1D DD, and its setup code can
+ * use the returned value to understand what it should do.
+ */
+bool is1D(const gmx_domdec_t& dd);
  
  /*! \brief Initialize data structures for bonded interactions */
  void dd_init_bondeds(FILE*                      fplog,
@@ -316,4 +318,39 @@ void dd_bonded_cg_distance(const gmx::MDLogger& mdlog,
                             real*                r_2b,
                             real*                r_mb);
  
+/*! \brief Construct the GPU halo exchange object(s)
+ * \param[in] mdlog          The logger object
+ * \param[in] cr             The commrec object
+ * \param[in] streamLocal    The local GPU stream
+ * \param[in] streamNonLocal The non-local GPU stream
+ */
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, void* streamLocal, void* streamNonLocal);
+
+/*! \brief
+ * (Re-) Initialization for GPU halo exchange
+ * \param [in] cr                   The commrec object
+ * \param [in] d_coordinatesBuffer  pointer to coordinates buffer in GPU memory
+ * \param [in] d_forcesBuffer       pointer to forces buffer in GPU memory
+ */
+void reinitGpuHaloExchange(const t_commrec&        cr,
+                           DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
+                           DeviceBuffer<gmx::RVec> d_forcesBuffer);
+
+
+/*! \brief GPU halo exchange of coordinates buffer.
+ * \param [in] cr                             The commrec object
+ * \param [in] box                            Coordinate box (from which shifts will be constructed)
+ * \param [in] coordinatesReadyOnDeviceEvent  event recorded when coordinates have been copied to device
+ */
+void communicateGpuHaloCoordinates(const t_commrec&      cr,
+                                   const matrix          box,
+                                   GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
+
+/*! \brief GPU halo exchange of force buffer.
+ * \param [in] cr                The commrec object
+ * \param [in] accumulateForces  True if forces should accumulate, otherwise they are set
+ */
+void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces);
+
  #endif
diff --git a/src/gromacs/domdec/domdec_struct.h b/src/gromacs/domdec/domdec_struct.h

index 12a134c96c3b6ecf022e31506980ce38c81b2e60..8670ca81c6fc246c4ee9c84f5ca36a103baa63fd 100644 (file)
--- a/src/gromacs/domdec/domdec_struct.h
+++ b/src/gromacs/domdec/domdec_struct.h
@@ -3,7 +3,7 @@
   *
   * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
   * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2018,2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -237,7 +237,7 @@ struct gmx_domdec_t
      std::vector<gmx::RVec> pmeForceReceiveBuffer;
  
      /* GPU halo exchange object */
-    std::unique_ptr<gmx::GpuHaloExchange> gpuHaloExchange;
+    std::vector<std::unique_ptr<gmx::GpuHaloExchange>> gpuHaloExchange;
  };
  
  //! Are we the master node for domain decomposition
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h

index d32b1800c4568bc32c3bcae91c8e244066e0b1ed..dc65cb93d3e9d6fb66f0209e9208235898672cca 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -82,8 +82,9 @@ public:
       * \param [in]    mpi_comm_mysim           communicator used for simulation
       * \param [in]    streamLocal              local NB CUDA stream.
       * \param [in]    streamNonLocal           non-local NB CUDA stream.
+     * \param [in]    pulse                    the communication pulse for this instance
       */
-    GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* streamLocal, void* streamNonLocal);
+    GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* streamLocal, void* streamNonLocal, int pulse);
      ~GpuHaloExchange();
  
      /*! \brief
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp

index a17c550c6c9044e77bad079c615a1e12e5ac7f39..1ce9a9d93e7490421c58feb8ef32b65a471692d3 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -63,7 +63,8 @@ class GpuHaloExchange::Impl
  GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
                                   MPI_Comm /* mpi_comm_mysim */,
                                   void* /*streamLocal */,
-                                 void* /*streamNonLocal */) :
+                                 void* /*streamNonLocal */,
+                                 int /*pulse */) :
      impl_(nullptr)
  {
      GMX_ASSERT(false,
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu

index 4313ffacb086e2109f68a735788bfa75b4ebe429..9c5aa5b8fe392cca6adc036d3e4620ef64c5ad37 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -134,14 +134,11 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
      d_f_ = d_forcesBuffer;
  
      cudaStream_t                 stream  = nonLocalStream_;
-    int                          nzone   = 1;
      const gmx_domdec_comm_t&     comm    = *dd_->comm;
      const gmx_domdec_comm_dim_t& cd      = comm.cd[0];
-    const gmx_domdec_ind_t&      ind     = cd.ind[0];
-    int                          newSize = ind.nsend[nzone + 1];
+    const gmx_domdec_ind_t&      ind     = cd.ind[pulse_];
+    int                          newSize = ind.nsend[nzone_ + 1];
  
-    GMX_RELEASE_ASSERT(cd.numPulses() == 1,
-                       "Multiple pulses are not yet supported in GPU halo exchange");
      GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");
  
      // reallocates only if needed
@@ -178,7 +175,13 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
  
      // Coordinates buffer:
  #if GMX_MPI
-    void* recvPtr = static_cast<void*>(&d_coordinatesBuffer[numHomeAtoms_]);
+    int pulseOffset = 0;
+    for (int p = pulse_ - 1; p >= 0; p--)
+    {
+        pulseOffset += cd.ind[p].nrecv[nzone_ + 1];
+    }
+    //    void* recvPtr = static_cast<void*>(&d_coordinatesBuffer[numHomeAtoms_ + pulseOffset]);
+    void* recvPtr = static_cast<void*>(&d_x_[numHomeAtoms_ + pulseOffset]);
      MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0, &remoteXPtr_, sizeof(void*),
                   MPI_BYTE, sendRankX_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
  
@@ -188,7 +191,6 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
                   MPI_BYTE, sendRankF_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
  #endif
  
-
      return;
  }
  
@@ -196,8 +198,11 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
                                                         GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
  {
  
-    // ensure stream waits until coordinate data is available on device
-    coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+    if (pulse_ == 0)
+    {
+        // ensure stream waits until coordinate data is available on device
+        coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+    }
  
      // launch kernel to pack send buffer
      KernelLaunchConfig config;
@@ -252,19 +257,22 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
  
      float3* d_f = d_f_;
  
-    if (!accumulateForces)
+    if (pulse_ == (dd_->comm->cd[0].numPulses() - 1))
      {
-        // Clear local portion of force array (in local stream)
-        cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_);
-    }
+        if (!accumulateForces)
+        {
+            // Clear local portion of force array (in local stream)
+            cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_);
+        }
  
-    // ensure non-local stream waits for local stream, due to dependence on
-    // the previous H2D copy of CPU forces (if accumulateForces is true)
-    // or the above clearing.
-    // TODO remove this dependency on localStream - edmine issue #3093
-    GpuEventSynchronizer eventLocal;
-    eventLocal.markEvent(localStream_);
-    eventLocal.enqueueWaitEvent(nonLocalStream_);
+        // ensure non-local stream waits for local stream, due to dependence on
+        // the previous H2D copy of CPU forces (if accumulateForces is true)
+        // or the above clearing.
+        // TODO remove this dependency on localStream - edmine issue #3093
+        GpuEventSynchronizer eventLocal;
+        eventLocal.markEvent(localStream_);
+        eventLocal.enqueueWaitEvent(nonLocalStream_);
+    }
  
      // Unpack halo buffer into force array
  
@@ -282,6 +290,14 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
      const int*    indexMap = d_indexMap_;
      const int     size     = fRecvSize_;
  
+    if (pulse_ > 0)
+    {
+        // We need to accumulate rather than set, since it is possible
+        // that, in this pulse, a value could be written to a location
+        // corresponding to the halo region of a following pulse.
+        accumulateForces = true;
+    }
+
      if (size > 0)
      {
          auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;
@@ -291,7 +307,11 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
  
          launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
      }
-    fReadyOnDevice_.markEvent(nonLocalStream_);
+
+    if (pulse_ == 0)
+    {
+        fReadyOnDevice_.markEvent(nonLocalStream_);
+    }
  }
  
  
@@ -329,7 +349,12 @@ void GpuHaloExchange::Impl::communicateHaloData(float3*               d_ptr,
      }
      else
      {
-        sendPtr   = static_cast<void*>(&(d_ptr[numHomeAtoms_]));
+        int recvOffset = dd_->comm->atomRanges.end(DDAtomRanges::Type::Zones);
+        for (int p = pulse_; p < dd_->comm->cd[0].numPulses(); p++)
+        {
+            recvOffset -= dd_->comm->cd[0].ind[p].nrecv[nzone_ + 1];
+        }
+        sendPtr   = static_cast<void*>(&(d_ptr[recvOffset]));
          sendSize  = fSendSize_;
          remotePtr = remoteFPtr_;
          sendRank  = sendRankF_;
@@ -389,7 +414,11 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
  }
  
  /*! \brief Create Domdec GPU object */
-GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
+GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,
+                            MPI_Comm      mpi_comm_mysim,
+                            void*         localStream,
+                            void*         nonLocalStream,
+                            int           pulse) :
      dd_(dd),
      sendRankX_(dd->neighbor[0][1]),
      recvRankX_(dd->neighbor[0][0]),
@@ -399,7 +428,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* loc
      haloDataTransferLaunched_(new GpuEventSynchronizer()),
      mpi_comm_mysim_(mpi_comm_mysim),
      localStream_(*static_cast<cudaStream_t*>(localStream)),
-    nonLocalStream_(*static_cast<cudaStream_t*>(nonLocalStream))
+    nonLocalStream_(*static_cast<cudaStream_t*>(nonLocalStream)),
+    pulse_(pulse)
  {
  
      GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
@@ -429,8 +459,12 @@ GpuHaloExchange::Impl::~Impl()
      delete haloDataTransferLaunched_;
  }
  
-GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
-    impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream))
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd,
+                                 MPI_Comm      mpi_comm_mysim,
+                                 void*         localStream,
+                                 void*         nonLocalStream,
+                                 int           pulse) :
+    impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, pulse))
  {
  }
  
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh

index 017cb191869e40bc9d5b46cb1c9f164a0767e6a5..b139a9b491e2704b927a8663af948c4b843fe6dd 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -72,8 +72,9 @@ public:
       * \param [in]    mpi_comm_mysim           communicator used for simulation
       * \param [in]    localStream              local NB CUDA stream
       * \param [in]    nonLocalStream           non-local NB CUDA stream
+     * \param [in]    pulse                    the communication pulse for this instance
       */
-    Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream);
+    Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream, int pulse);
      ~Impl();
  
      /*! \brief
@@ -184,6 +185,10 @@ private:
      float3* d_f_ = nullptr;
      //! An event recorded once the exchanged forces are ready on the GPU
      GpuEventSynchronizer fReadyOnDevice_;
+    //! The pulse corresponding to this halo exchange instance
+    int pulse_ = 0;
+    //! Number of zones. Always 1 for 1-D case.
+    const int nzone_ = 1;
  };
  
  } // namespace gmx
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 07644556aaf76f22885166f8bafb2fccc4db81ae..61878b8f5af58b95b8d7c6b1c05183d001ced540 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1001,6 +1001,7 @@ void do_force(FILE*                               fplog,
      {
          if (stepWork.doNeighborSearch)
          {
+            // TODO refactor this to do_md, after partitioning.
              stateGpu->reinit(mdatoms->homenr,
                               cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
              if (useGpuPmeOnThisRank)
@@ -1023,9 +1024,8 @@ void do_force(FILE*                               fplog,
      // The conditions for gpuHaloExchange e.g. using GPU buffer
      // operations were checked before construction, so here we can
      // just use it and assert upon any conditions.
-    gmx::GpuHaloExchange* gpuHaloExchange =
-            (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
-    const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
+    const bool ddUsesGpuDirectCommunication =
+            ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange.empty()));
      GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
                 "Must use coordinate buffer ops with GPU halo exchange");
      const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
@@ -1259,9 +1259,13 @@ void do_force(FILE*                               fplog,
              nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
              wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
              wallcycle_stop(wcycle, ewcNS);
+            // TODO refactor this GPU halo exchange re-initialisation
+            // to location in do_md where GPU halo exchange is
+            // constructed at partitioning, after above stateGpu
+            // re-initialization has similarly been refactored
              if (ddUsesGpuDirectCommunication)
              {
-                gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces());
+                reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
              }
          }
          else
@@ -1270,7 +1274,7 @@ void do_force(FILE*                               fplog,
              {
                  // The following must be called after local setCoordinates (which records an event
                  // when the coordinate data has been copied to the device).
-                gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice);
+                communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
  
                  if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                  {
@@ -1590,7 +1594,7 @@ void do_force(FILE*                               fplog,
                  {
                      stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
                  }
-                gpuHaloExchange->communicateHaloForces(domainWork.haveCpuLocalForceWork);
+                communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
              }
              else
              {
@@ -1731,7 +1735,7 @@ void do_force(FILE*                               fplog,
              }
              if (useGpuForcesHaloExchange)
              {
-                dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
+                dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent());
              }
              nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                                dependencyList, stepWork.useGpuPmeFReduction,
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index b86fc126869de45bdc72bbf8abda2e13e0ffff69..5314e3ac9c39e9423ce4689c63eba32cbf4153e8 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -58,6 +58,7 @@
  #include "gromacs/domdec/domdec.h"
  #include "gromacs/domdec/domdec_network.h"
  #include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
  #include "gromacs/domdec/mdsetup.h"
  #include "gromacs/domdec/partition.h"
  #include "gromacs/essentialdynamics/edsam.h"
@@ -843,6 +844,18 @@ void gmx::LegacySimulator::do_md()
                                      fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
                  shouldCheckNumberOfBondedInteractions = true;
                  upd.setNumAtoms(state->natoms);
+
+                // Allocate or re-size GPU halo exchange object, if necessary
+                if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
+                    && useGpuForNonbonded && is1D(*cr->dd))
+                {
+                    // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
+                    void* streamLocal =
+                            Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
+                    void* streamNonLocal = Nbnxm::gpu_get_command_stream(
+                            fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
+                    constructGpuHaloExchange(mdlog, *cr, streamLocal, streamNonLocal);
+                }
              }
          }
  
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index 3debb2046e5b12ad3e9f92874b44694033e75474..41f609d1cc3659226b6da206881e5fa6e577f6a9 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -254,7 +254,8 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
              GMX_LOG(mdlog.warning)
                      .asParagraph()
                      .appendTextFormatted(
-                            "This run uses the 'GPU halo exchange' feature, enabled by the "
+                            "This run has requested the 'GPU halo exchange' feature, enabled by "
+                            "the "
                              "GMX_GPU_DD_COMMS environment variable.");
          }
          else
@@ -1358,26 +1359,6 @@ int Mdrunner::mdrunner()
              fr->gpuBonded = gpuBonded.get();
          }
  
-        // TODO Move this to happen during domain decomposition setup,
-        // once stream and event handling works well with that.
-        // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
-        if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
-        {
-            GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps,
-                               "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
-            void* streamLocal =
-                    Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
-            void* streamNonLocal =
-                    Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendTextFormatted(
-                            "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
-                            "GMX_GPU_DD_COMMS environment variable.");
-            cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(
-                    cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal);
-        }
-
          /* Initialize the mdAtoms structure.
           * mdAtoms is not filled with atom data,
           * as this can not be done now with domain decomposition.
author	Alan Gray <alangraygerrit@gmail.com>
	Fri, 6 Dec 2019 13:19:43 +0000 (05:19 -0800)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Sat, 15 Feb 2020 16:52:13 +0000 (17:52 +0100)
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/domdec/domdec.h		patch \| blob \| history
src/gromacs/domdec/domdec_struct.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cpp		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cu		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cuh		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdrun/runner.cpp		patch \| blob \| history