From: Alan Gray <alangraygerrit@gmail.com>
Date: Fri, 6 Dec 2019 13:19:43 +0000 (-0800)
Subject: Multiple pulses for GPU Halo Exchange
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=fdf8c9064e2289bb01b1536a06f83755450caee8;p=alexxy%2Fgromacs.git

Multiple pulses for GPU Halo Exchange

Removes restriction on single pulse.

Implements #3106

Change-Id: I5d68258de831d04c14d6c352fc52e51852fccd80
---

diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index c625bdf7f9..13b7a33e2b 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -2949,17 +2949,13 @@ static bool canMake1DAnd1PulseDomainDecomposition(const DDSettings&
     return canMakeDDWith1DAnd1Pulse;
 }
 
-bool is1DAnd1PulseDD(const gmx_domdec_t& dd)
+bool is1D(const gmx_domdec_t& dd)
 {
     const int maxDimensionSize = std::max(dd.numCells[XX], std::max(dd.numCells[YY], dd.numCells[ZZ]));
     const int  productOfDimensionSizes      = dd.numCells[XX] * dd.numCells[YY] * dd.numCells[ZZ];
     const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes);
 
-    const bool hasMax1Pulse =
-            ((isDlbDisabled(dd.comm) && dd.comm->cellsize_limit >= dd.comm->systemInfo.cutoff)
-             || (!isDlbDisabled(dd.comm) && dd.comm->maxpulse == 1));
-
-    return decompositionHasOneDimension && hasMax1Pulse;
+    return decompositionHasOneDimension;
 }
 
 namespace gmx
@@ -3216,3 +3212,60 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRef<const g
 
     return bCutoffAllowed;
 }
+
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, void* streamLocal, void* streamNonLocal)
+{
+
+    int gpuHaloExchangeSize = 0;
+    int pulseStart          = 0;
+    if (cr.dd->gpuHaloExchange.empty())
+    {
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendTextFormatted(
+                        "NOTE: Activating the 'GPU halo exchange' feature, enabled "
+                        "by the "
+                        "GMX_GPU_DD_COMMS environment variable.");
+    }
+    else
+    {
+        gpuHaloExchangeSize = static_cast<int>(cr.dd->gpuHaloExchange.size());
+        pulseStart          = gpuHaloExchangeSize - 1;
+    }
+    if (cr.dd->comm->cd[0].numPulses() > gpuHaloExchangeSize)
+    {
+        for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+        {
+            cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
+                    cr.dd, cr.mpi_comm_mysim, streamLocal, streamNonLocal, pulse));
+        }
+    }
+}
+
+void reinitGpuHaloExchange(const t_commrec&              cr,
+                           const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
+                           const DeviceBuffer<gmx::RVec> d_forcesBuffer)
+{
+    for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+    {
+        cr.dd->gpuHaloExchange[pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
+    }
+}
+
+void communicateGpuHaloCoordinates(const t_commrec&      cr,
+                                   const matrix          box,
+                                   GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+    for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+    {
+        cr.dd->gpuHaloExchange[pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+    }
+}
+
+void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
+{
+    for (int pulse = cr.dd->comm->cd[0].numPulses() - 1; pulse >= 0; pulse--)
+    {
+        cr.dd->gpuHaloExchange[pulse]->communicateHaloForces(accumulateForces);
+    }
+}
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h
index 439e86b5e7..1f3fad4405 100644
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -62,6 +62,7 @@
 
 #include <vector>
 
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/basedefinitions.h"
@@ -84,6 +85,7 @@ struct t_nrnb;
 struct gmx_wallcycle;
 enum class PbcType : int;
 class t_state;
+class GpuEventSynchronizer;
 
 namespace gmx
 {
@@ -152,12 +154,12 @@ bool ddHaveSplitConstraints(const gmx_domdec_t& dd);
 /*! \brief Return whether update groups are used */
 bool ddUsesUpdateGroups(const gmx_domdec_t& dd);
 
-/*! \brief Return whether the DD has a single dimension with a single pulse
+/*! \brief Return whether the DD has a single dimension
  *
- * The GPU halo exchange code requires a 1D single-pulse DD, and its
- * setup code can use the returned value to understand what it should
- * do. */
-bool is1DAnd1PulseDD(const gmx_domdec_t& dd);
+ * The GPU halo exchange code requires a 1D DD, and its setup code can
+ * use the returned value to understand what it should do.
+ */
+bool is1D(const gmx_domdec_t& dd);
 
 /*! \brief Initialize data structures for bonded interactions */
 void dd_init_bondeds(FILE*                      fplog,
@@ -316,4 +318,39 @@ void dd_bonded_cg_distance(const gmx::MDLogger& mdlog,
                            real*                r_2b,
                            real*                r_mb);
 
+/*! \brief Construct the GPU halo exchange object(s)
+ * \param[in] mdlog          The logger object
+ * \param[in] cr             The commrec object
+ * \param[in] streamLocal    The local GPU stream
+ * \param[in] streamNonLocal The non-local GPU stream
+ */
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, void* streamLocal, void* streamNonLocal);
+
+/*! \brief
+ * (Re-) Initialization for GPU halo exchange
+ * \param [in] cr                   The commrec object
+ * \param [in] d_coordinatesBuffer  pointer to coordinates buffer in GPU memory
+ * \param [in] d_forcesBuffer       pointer to forces buffer in GPU memory
+ */
+void reinitGpuHaloExchange(const t_commrec&        cr,
+                           DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
+                           DeviceBuffer<gmx::RVec> d_forcesBuffer);
+
+
+/*! \brief GPU halo exchange of coordinates buffer.
+ * \param [in] cr                             The commrec object
+ * \param [in] box                            Coordinate box (from which shifts will be constructed)
+ * \param [in] coordinatesReadyOnDeviceEvent  event recorded when coordinates have been copied to device
+ */
+void communicateGpuHaloCoordinates(const t_commrec&      cr,
+                                   const matrix          box,
+                                   GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
+
+/*! \brief GPU halo exchange of force buffer.
+ * \param [in] cr                The commrec object
+ * \param [in] accumulateForces  True if forces should accumulate, otherwise they are set
+ */
+void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces);
+
 #endif
diff --git a/src/gromacs/domdec/domdec_struct.h b/src/gromacs/domdec/domdec_struct.h
index 12a134c96c..8670ca81c6 100644
--- a/src/gromacs/domdec/domdec_struct.h
+++ b/src/gromacs/domdec/domdec_struct.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2018,2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -237,7 +237,7 @@ struct gmx_domdec_t
     std::vector<gmx::RVec> pmeForceReceiveBuffer;
 
     /* GPU halo exchange object */
-    std::unique_ptr<gmx::GpuHaloExchange> gpuHaloExchange;
+    std::vector<std::unique_ptr<gmx::GpuHaloExchange>> gpuHaloExchange;
 };
 
 //! Are we the master node for domain decomposition
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h
index d32b1800c4..dc65cb93d3 100644
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -82,8 +82,9 @@ public:
      * \param [in]    mpi_comm_mysim           communicator used for simulation
      * \param [in]    streamLocal              local NB CUDA stream.
      * \param [in]    streamNonLocal           non-local NB CUDA stream.
+     * \param [in]    pulse                    the communication pulse for this instance
      */
-    GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* streamLocal, void* streamNonLocal);
+    GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* streamLocal, void* streamNonLocal, int pulse);
     ~GpuHaloExchange();
 
     /*! \brief
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
index a17c550c6c..1ce9a9d93e 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -63,7 +63,8 @@ class GpuHaloExchange::Impl
 GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
                                  MPI_Comm /* mpi_comm_mysim */,
                                  void* /*streamLocal */,
-                                 void* /*streamNonLocal */) :
+                                 void* /*streamNonLocal */,
+                                 int /*pulse */) :
     impl_(nullptr)
 {
     GMX_ASSERT(false,
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index 4313ffacb0..9c5aa5b8fe 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -134,14 +134,11 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
     d_f_ = d_forcesBuffer;
 
     cudaStream_t                 stream  = nonLocalStream_;
-    int                          nzone   = 1;
     const gmx_domdec_comm_t&     comm    = *dd_->comm;
     const gmx_domdec_comm_dim_t& cd      = comm.cd[0];
-    const gmx_domdec_ind_t&      ind     = cd.ind[0];
-    int                          newSize = ind.nsend[nzone + 1];
+    const gmx_domdec_ind_t&      ind     = cd.ind[pulse_];
+    int                          newSize = ind.nsend[nzone_ + 1];
 
-    GMX_RELEASE_ASSERT(cd.numPulses() == 1,
-                       "Multiple pulses are not yet supported in GPU halo exchange");
     GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");
 
     // reallocates only if needed
@@ -178,7 +175,13 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
 
     // Coordinates buffer:
 #if GMX_MPI
-    void* recvPtr = static_cast<void*>(&d_coordinatesBuffer[numHomeAtoms_]);
+    int pulseOffset = 0;
+    for (int p = pulse_ - 1; p >= 0; p--)
+    {
+        pulseOffset += cd.ind[p].nrecv[nzone_ + 1];
+    }
+    //    void* recvPtr = static_cast<void*>(&d_coordinatesBuffer[numHomeAtoms_ + pulseOffset]);
+    void* recvPtr = static_cast<void*>(&d_x_[numHomeAtoms_ + pulseOffset]);
     MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0, &remoteXPtr_, sizeof(void*),
                  MPI_BYTE, sendRankX_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
 
@@ -188,7 +191,6 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
                  MPI_BYTE, sendRankF_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
 #endif
 
-
     return;
 }
 
@@ -196,8 +198,11 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
                                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
 
-    // ensure stream waits until coordinate data is available on device
-    coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+    if (pulse_ == 0)
+    {
+        // ensure stream waits until coordinate data is available on device
+        coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+    }
 
     // launch kernel to pack send buffer
     KernelLaunchConfig config;
@@ -252,19 +257,22 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
 
     float3* d_f = d_f_;
 
-    if (!accumulateForces)
+    if (pulse_ == (dd_->comm->cd[0].numPulses() - 1))
     {
-        // Clear local portion of force array (in local stream)
-        cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_);
-    }
+        if (!accumulateForces)
+        {
+            // Clear local portion of force array (in local stream)
+            cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_);
+        }
 
-    // ensure non-local stream waits for local stream, due to dependence on
-    // the previous H2D copy of CPU forces (if accumulateForces is true)
-    // or the above clearing.
-    // TODO remove this dependency on localStream - edmine issue #3093
-    GpuEventSynchronizer eventLocal;
-    eventLocal.markEvent(localStream_);
-    eventLocal.enqueueWaitEvent(nonLocalStream_);
+        // ensure non-local stream waits for local stream, due to dependence on
+        // the previous H2D copy of CPU forces (if accumulateForces is true)
+        // or the above clearing.
+        // TODO remove this dependency on localStream - edmine issue #3093
+        GpuEventSynchronizer eventLocal;
+        eventLocal.markEvent(localStream_);
+        eventLocal.enqueueWaitEvent(nonLocalStream_);
+    }
 
     // Unpack halo buffer into force array
 
@@ -282,6 +290,14 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
     const int*    indexMap = d_indexMap_;
     const int     size     = fRecvSize_;
 
+    if (pulse_ > 0)
+    {
+        // We need to accumulate rather than set, since it is possible
+        // that, in this pulse, a value could be written to a location
+        // corresponding to the halo region of a following pulse.
+        accumulateForces = true;
+    }
+
     if (size > 0)
     {
         auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;
@@ -291,7 +307,11 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
 
         launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
     }
-    fReadyOnDevice_.markEvent(nonLocalStream_);
+
+    if (pulse_ == 0)
+    {
+        fReadyOnDevice_.markEvent(nonLocalStream_);
+    }
 }
 
 
@@ -329,7 +349,12 @@ void GpuHaloExchange::Impl::communicateHaloData(float3*               d_ptr,
     }
     else
     {
-        sendPtr   = static_cast<void*>(&(d_ptr[numHomeAtoms_]));
+        int recvOffset = dd_->comm->atomRanges.end(DDAtomRanges::Type::Zones);
+        for (int p = pulse_; p < dd_->comm->cd[0].numPulses(); p++)
+        {
+            recvOffset -= dd_->comm->cd[0].ind[p].nrecv[nzone_ + 1];
+        }
+        sendPtr   = static_cast<void*>(&(d_ptr[recvOffset]));
         sendSize  = fSendSize_;
         remotePtr = remoteFPtr_;
         sendRank  = sendRankF_;
@@ -389,7 +414,11 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
 }
 
 /*! \brief Create Domdec GPU object */
-GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
+GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,
+                            MPI_Comm      mpi_comm_mysim,
+                            void*         localStream,
+                            void*         nonLocalStream,
+                            int           pulse) :
     dd_(dd),
     sendRankX_(dd->neighbor[0][1]),
     recvRankX_(dd->neighbor[0][0]),
@@ -399,7 +428,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* loc
     haloDataTransferLaunched_(new GpuEventSynchronizer()),
     mpi_comm_mysim_(mpi_comm_mysim),
     localStream_(*static_cast<cudaStream_t*>(localStream)),
-    nonLocalStream_(*static_cast<cudaStream_t*>(nonLocalStream))
+    nonLocalStream_(*static_cast<cudaStream_t*>(nonLocalStream)),
+    pulse_(pulse)
 {
 
     GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
@@ -429,8 +459,12 @@ GpuHaloExchange::Impl::~Impl()
     delete haloDataTransferLaunched_;
 }
 
-GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
-    impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream))
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd,
+                                 MPI_Comm      mpi_comm_mysim,
+                                 void*         localStream,
+                                 void*         nonLocalStream,
+                                 int           pulse) :
+    impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, pulse))
 {
 }
 
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
index 017cb19186..b139a9b491 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -72,8 +72,9 @@ public:
      * \param [in]    mpi_comm_mysim           communicator used for simulation
      * \param [in]    localStream              local NB CUDA stream
      * \param [in]    nonLocalStream           non-local NB CUDA stream
+     * \param [in]    pulse                    the communication pulse for this instance
      */
-    Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream);
+    Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream, int pulse);
     ~Impl();
 
     /*! \brief
@@ -184,6 +185,10 @@ private:
     float3* d_f_ = nullptr;
     //! An event recorded once the exchanged forces are ready on the GPU
     GpuEventSynchronizer fReadyOnDevice_;
+    //! The pulse corresponding to this halo exchange instance
+    int pulse_ = 0;
+    //! Number of zones. Always 1 for 1-D case.
+    const int nzone_ = 1;
 };
 
 } // namespace gmx
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 07644556aa..61878b8f5a 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1001,6 +1001,7 @@ void do_force(FILE*                               fplog,
     {
         if (stepWork.doNeighborSearch)
         {
+            // TODO refactor this to do_md, after partitioning.
             stateGpu->reinit(mdatoms->homenr,
                              cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
             if (useGpuPmeOnThisRank)
@@ -1023,9 +1024,8 @@ void do_force(FILE*                               fplog,
     // The conditions for gpuHaloExchange e.g. using GPU buffer
     // operations were checked before construction, so here we can
     // just use it and assert upon any conditions.
-    gmx::GpuHaloExchange* gpuHaloExchange =
-            (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
-    const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
+    const bool ddUsesGpuDirectCommunication =
+            ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange.empty()));
     GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
                "Must use coordinate buffer ops with GPU halo exchange");
     const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
@@ -1259,9 +1259,13 @@ void do_force(FILE*                               fplog,
             nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
             wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
             wallcycle_stop(wcycle, ewcNS);
+            // TODO refactor this GPU halo exchange re-initialisation
+            // to location in do_md where GPU halo exchange is
+            // constructed at partitioning, after above stateGpu
+            // re-initialization has similarly been refactored
             if (ddUsesGpuDirectCommunication)
             {
-                gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces());
+                reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
             }
         }
         else
@@ -1270,7 +1274,7 @@ void do_force(FILE*                               fplog,
             {
                 // The following must be called after local setCoordinates (which records an event
                 // when the coordinate data has been copied to the device).
-                gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice);
+                communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
 
                 if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
                 {
@@ -1590,7 +1594,7 @@ void do_force(FILE*                               fplog,
                 {
                     stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
                 }
-                gpuHaloExchange->communicateHaloForces(domainWork.haveCpuLocalForceWork);
+                communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
             }
             else
             {
@@ -1731,7 +1735,7 @@ void do_force(FILE*                               fplog,
             }
             if (useGpuForcesHaloExchange)
             {
-                dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
+                dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent());
             }
             nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                               dependencyList, stepWork.useGpuPmeFReduction,
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index b86fc12686..5314e3ac9c 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -58,6 +58,7 @@
 #include "gromacs/domdec/domdec.h"
 #include "gromacs/domdec/domdec_network.h"
 #include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
 #include "gromacs/domdec/mdsetup.h"
 #include "gromacs/domdec/partition.h"
 #include "gromacs/essentialdynamics/edsam.h"
@@ -843,6 +844,18 @@ void gmx::LegacySimulator::do_md()
                                     fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
                 shouldCheckNumberOfBondedInteractions = true;
                 upd.setNumAtoms(state->natoms);
+
+                // Allocate or re-size GPU halo exchange object, if necessary
+                if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
+                    && useGpuForNonbonded && is1D(*cr->dd))
+                {
+                    // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
+                    void* streamLocal =
+                            Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
+                    void* streamNonLocal = Nbnxm::gpu_get_command_stream(
+                            fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
+                    constructGpuHaloExchange(mdlog, *cr, streamLocal, streamNonLocal);
+                }
             }
         }
 
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 3debb2046e..41f609d1cc 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -254,7 +254,8 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
             GMX_LOG(mdlog.warning)
                     .asParagraph()
                     .appendTextFormatted(
-                            "This run uses the 'GPU halo exchange' feature, enabled by the "
+                            "This run has requested the 'GPU halo exchange' feature, enabled by "
+                            "the "
                             "GMX_GPU_DD_COMMS environment variable.");
         }
         else
@@ -1358,26 +1359,6 @@ int Mdrunner::mdrunner()
             fr->gpuBonded = gpuBonded.get();
         }
 
-        // TODO Move this to happen during domain decomposition setup,
-        // once stream and event handling works well with that.
-        // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
-        if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
-        {
-            GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps,
-                               "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
-            void* streamLocal =
-                    Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
-            void* streamNonLocal =
-                    Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendTextFormatted(
-                            "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
-                            "GMX_GPU_DD_COMMS environment variable.");
-            cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(
-                    cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal);
-        }
-
         /* Initialize the mdAtoms structure.
          * mdAtoms is not filled with atom data,
          * as this can not be done now with domain decomposition.