Remove single-dimension limitation from GPU halo exchange

author Alan Gray <alangray3@gmail.com>

Mon, 28 Sep 2020 20:53:07 +0000 (20:53 +0000)

committer Artem Zhmurov <zhmurov@gmail.com>

Mon, 28 Sep 2020 20:53:07 +0000 (20:53 +0000)
author Alan Gray <alangray3@gmail.com>
Mon, 28 Sep 2020 20:53:07 +0000 (20:53 +0000)
committer Artem Zhmurov <zhmurov@gmail.com>
Mon, 28 Sep 2020 20:53:07 +0000 (20:53 +0000)
diff --git a/src/gromacs/domdec/builder.h b/src/gromacs/domdec/builder.h

index ea31080c627bc71ef338f66a129671dcb079d6c6..01239c8058a5c9c5b2048de02acd7c2aea310f41 100644 (file)
--- a/src/gromacs/domdec/builder.h
+++ b/src/gromacs/domdec/builder.h
@@ -79,7 +79,6 @@ public:
                                 t_commrec*           cr,
                                 const DomdecOptions& options,
                                 const MdrunOptions&  mdrunOptions,
-                               bool                 prefer1D,
                                 const gmx_mtop_t&    mtop,
                                 const t_inputrec&    ir,
                                 const matrix         box,
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp

index ddb74cbc3d8b22b0fa95bc7f722deee82140fb52..d3ba5aba08ef2f8e997bb988ab5baab865303824 100644 (file)
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -2865,7 +2865,6 @@ static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
  
      ddSettings.useSendRecv2        = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0);
      ddSettings.dlb_scale_lim       = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10);
-    ddSettings.request1D           = bool(dd_getenv(mdlog, "GMX_DD_1D", 0));
      ddSettings.useDDOrderZYX       = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0));
      ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1));
      ddSettings.eFlop               = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0);
@@ -2904,63 +2903,6 @@ static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
  
  gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {}
  
-/*! \brief Return whether the simulation described can run a 1D DD.
- *
- * The GPU halo exchange code requires 1D DD. Such a DD
- * generally requires a larger box than other possible decompositions
- * with the same rank count, so the calling code might need to decide
- * what is the most appropriate way to run the simulation based on
- * whether such a DD is possible.
- *
- * This function works like init_domain_decomposition(), but will not
- * give a fatal error, and only uses \c cr for communicating between
- * ranks.
- *
- * It is safe to call before thread-MPI spawns ranks, so that
- * thread-MPI can decide whether and how to trigger the GPU halo
- * exchange code path. The number of PME ranks, if any, should be set
- * in \c options.numPmeRanks.
- */
-static bool canMake1DDomainDecomposition(const DDSettings&              ddSettingsOriginal,
-                                         DDRole                         ddRole,
-                                         MPI_Comm                       communicator,
-                                         const int                      numRanksRequested,
-                                         const DomdecOptions&           options,
-                                         const gmx_mtop_t&              mtop,
-                                         const t_inputrec&              ir,
-                                         const matrix                   box,
-                                         gmx::ArrayRef<const gmx::RVec> xGlobal)
-{
-    // Ensure we don't write any output from this checking routine
-    gmx::MDLogger dummyLogger;
-
-    DDSystemInfo systemInfo =
-            getSystemInfo(dummyLogger, ddRole, communicator, options, mtop, ir, box, xGlobal);
-
-    DDSettings ddSettings = ddSettingsOriginal;
-    ddSettings.request1D  = true;
-    const real gridSetupCellsizeLimit =
-            getDDGridSetupCellSizeLimit(dummyLogger, !isDlbDisabled(ddSettings.initialDlbState),
-                                        options.dlbScaling, ir, systemInfo.cellsizeLimit);
-    gmx_ddbox_t ddbox = { 0 };
-    DDGridSetup ddGridSetup =
-            getDDGridSetup(dummyLogger, ddRole, communicator, numRanksRequested, options, ddSettings,
-                           systemInfo, gridSetupCellsizeLimit, mtop, ir, box, xGlobal, &ddbox);
-
-    const bool canMake1DDD = (ddGridSetup.numDomains[XX] != 0);
-
-    return canMake1DDD;
-}
-
-bool is1D(const gmx_domdec_t& dd)
-{
-    const int maxDimensionSize = std::max(dd.numCells[XX], std::max(dd.numCells[YY], dd.numCells[ZZ]));
-    const int  productOfDimensionSizes      = dd.numCells[XX] * dd.numCells[YY] * dd.numCells[ZZ];
-    const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes);
-
-    return decompositionHasOneDimension;
-}
-
  namespace gmx
  {
  
@@ -2975,7 +2917,6 @@ public:
           t_commrec*           cr,
           const DomdecOptions& options,
           const MdrunOptions&  mdrunOptions,
-         bool                 prefer1D,
           const gmx_mtop_t&    mtop,
           const t_inputrec&    ir,
           const matrix         box,
@@ -3023,7 +2964,6 @@ DomainDecompositionBuilder::Impl::Impl(const MDLogger&      mdlog,
                                         t_commrec*           cr,
                                         const DomdecOptions& options,
                                         const MdrunOptions&  mdrunOptions,
-                                       const bool           prefer1D,
                                         const gmx_mtop_t&    mtop,
                                         const t_inputrec&    ir,
                                         const matrix         box,
@@ -3038,14 +2978,6 @@ DomainDecompositionBuilder::Impl::Impl(const MDLogger&      mdlog,
  
      ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_);
  
-    if (prefer1D
-        && canMake1DDomainDecomposition(ddSettings_, MASTER(cr_) ? DDRole::Master : DDRole::Agent,
-                                        cr->mpiDefaultCommunicator, cr_->sizeOfDefaultCommunicator,
-                                        options_, mtop_, ir_, box, xGlobal))
-    {
-        ddSettings_.request1D = true;
-    }
-
      if (ddSettings_.eFlop > 1)
      {
          /* Ensure that we have different random flop counts on different ranks */
@@ -3118,12 +3050,11 @@ DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&      mdlo
                                                         t_commrec*           cr,
                                                         const DomdecOptions& options,
                                                         const MdrunOptions&  mdrunOptions,
-                                                       const bool           prefer1D,
                                                         const gmx_mtop_t&    mtop,
                                                         const t_inputrec&    ir,
                                                         const matrix         box,
                                                         ArrayRef<const RVec> xGlobal) :
-    impl_(new Impl(mdlog, cr, options, mdrunOptions, prefer1D, mtop, ir, box, xGlobal))
+    impl_(new Impl(mdlog, cr, options, mdrunOptions, mtop, ir, box, xGlobal))
  {
  }
  
@@ -3227,9 +3158,8 @@ void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
      GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
                         "Non-local non-bonded stream should be valid when using "
                         "GPU halo exchange.");
-    int gpuHaloExchangeSize = 0;
-    int pulseStart          = 0;
-    if (cr.dd->gpuHaloExchange.empty())
+
+    if (cr.dd->gpuHaloExchange[0].empty())
      {
          GMX_LOG(mdlog.warning)
                  .asParagraph()
@@ -3238,17 +3168,13 @@ void constructGpuHaloExchange(const gmx::MDLogger&            mdlog,
                          "by the "
                          "GMX_GPU_DD_COMMS environment variable.");
      }
-    else
-    {
-        gpuHaloExchangeSize = static_cast<int>(cr.dd->gpuHaloExchange.size());
-        pulseStart          = gpuHaloExchangeSize - 1;
-    }
-    if (cr.dd->comm->cd[0].numPulses() > gpuHaloExchangeSize)
+
+    for (int d = 0; d < cr.dd->ndim; d++)
      {
-        for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+        for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
          {
-            cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
-                    cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(),
+            cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
+                    cr.dd, d, cr.mpi_comm_mysim, deviceStreamManager.context(),
                      deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
                      deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse, wcycle));
          }
@@ -3259,9 +3185,12 @@ void reinitGpuHaloExchange(const t_commrec&              cr,
                             const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
                             const DeviceBuffer<gmx::RVec> d_forcesBuffer)
  {
-    for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+    for (int d = 0; d < cr.dd->ndim; d++)
      {
-        cr.dd->gpuHaloExchange[pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
+        for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
+        {
+            cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
+        }
      }
  }
  
@@ -3269,16 +3198,22 @@ void communicateGpuHaloCoordinates(const t_commrec&      cr,
                                     const matrix          box,
                                     GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
  {
-    for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+    for (int d = 0; d < cr.dd->ndim; d++)
      {
-        cr.dd->gpuHaloExchange[pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+        for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
+        {
+            cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+        }
      }
  }
  
  void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
  {
-    for (int pulse = cr.dd->comm->cd[0].numPulses() - 1; pulse >= 0; pulse--)
+    for (int d = cr.dd->ndim - 1; d >= 0; d--)
      {
-        cr.dd->gpuHaloExchange[pulse]->communicateHaloForces(accumulateForces);
+        for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
+        {
+            cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
+        }
      }
  }
diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h

index b6c79dca00bba143412327ea17bd55280d832e97..2b65e1989b822eb638f3375b86f29fe0d1f2c681 100644 (file)
--- a/src/gromacs/domdec/domdec.h
+++ b/src/gromacs/domdec/domdec.h
@@ -156,13 +156,6 @@ bool ddHaveSplitConstraints(const gmx_domdec_t& dd);
  /*! \brief Return whether update groups are used */
  bool ddUsesUpdateGroups(const gmx_domdec_t& dd);
  
-/*! \brief Return whether the DD has a single dimension
- *
- * The GPU halo exchange code requires a 1D DD, and its setup code can
- * use the returned value to understand what it should do.
- */
-bool is1D(const gmx_domdec_t& dd);
-
  /*! \brief Initialize data structures for bonded interactions */
  void dd_init_bondeds(FILE*                           fplog,
                       gmx_domdec_t*                   dd,
diff --git a/src/gromacs/domdec/domdec_internal.h b/src/gromacs/domdec/domdec_internal.h

index 7bee8f819857e1ea3d10c6c48a04b9573ccedc47..691b0b87aa4cef053579f538791b82c868beb2d7 100644 (file)
--- a/src/gromacs/domdec/domdec_internal.h
+++ b/src/gromacs/domdec/domdec_internal.h
@@ -473,9 +473,6 @@ struct DDSettings
      //! Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise
      int eFlop = 0;
  
-    //! Request 1D domain decomposition
-    bool request1D;
-
      //! Whether to order the DD dimensions from z to x
      bool useDDOrderZYX = false;
  
diff --git a/src/gromacs/domdec/domdec_setup.cpp b/src/gromacs/domdec/domdec_setup.cpp

index d4c7dc342e4edef859419fe5bcaa47b45a8dda01..1b058becc5b7c0168faa3609c8a6783e9ed8e2e1 100644 (file)
--- a/src/gromacs/domdec/domdec_setup.cpp
+++ b/src/gromacs/domdec/domdec_setup.cpp
@@ -553,7 +553,6 @@ static float comm_cost_est(real               limit,
  /*! \brief Assign penalty factors to possible domain decompositions,
   * based on the estimated communication costs. */
  static void assign_factors(const real         limit,
-                           const bool         request1D,
                             const real         cutoff,
                             const matrix       box,
                             const gmx_ddbox_t& ddbox,
@@ -573,14 +572,6 @@ static void assign_factors(const real         limit,
  
      if (ndiv == 0)
      {
-        const int  maxDimensionSize        = std::max(ir_try[XX], std::max(ir_try[YY], ir_try[ZZ]));
-        const int  productOfDimensionSizes = ir_try[XX] * ir_try[YY] * ir_try[ZZ];
-        const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes);
-        if (request1D && !decompositionHasOneDimension)
-        {
-            /* We requested 1D DD, but got multiple dimensions */
-            return;
-        }
  
          ce = comm_cost_est(limit, cutoff, box, ddbox, natoms, ir, pbcdxr, npme, ir_try);
          if (ce >= 0
@@ -611,8 +602,8 @@ static void assign_factors(const real         limit,
              }
  
              /* recurse */
-            assign_factors(limit, request1D, cutoff, box, ddbox, natoms, ir, pbcdxr, npme, ndiv - 1,
-                           div + 1, mdiv + 1, irTryPtr, opt);
+            assign_factors(limit, cutoff, box, ddbox, natoms, ir, pbcdxr, npme, ndiv - 1, div + 1,
+                           mdiv + 1, irTryPtr, opt);
  
              for (i = 0; i < mdiv[0] - x - y; i++)
              {
@@ -639,7 +630,6 @@ static gmx::IVec optimizeDDCells(const gmx::MDLogger& mdlog,
                                   const int            numRanksRequested,
                                   const int            numPmeOnlyRanks,
                                   const real           cellSizeLimit,
-                                 const bool           request1D,
                                   const gmx_mtop_t&    mtop,
                                   const matrix         box,
                                   const gmx_ddbox_t&   ddbox,
@@ -716,7 +706,7 @@ static gmx::IVec optimizeDDCells(const gmx::MDLogger& mdlog,
  
      gmx::IVec itry       = { 1, 1, 1 };
      gmx::IVec numDomains = { 0, 0, 0 };
-    assign_factors(cellSizeLimit, request1D, systemInfo.cutoff, box, ddbox, mtop.natoms, ir, pbcdxr,
+    assign_factors(cellSizeLimit, systemInfo.cutoff, box, ddbox, mtop.natoms, ir, pbcdxr,
                     numRanksDoingPmeWork, div.size(), div.data(), mdiv.data(), &itry, &numDomains);
  
      return numDomains;
@@ -914,14 +904,6 @@ DDGridSetup getDDGridSetup(const gmx::MDLogger&           mdlog,
  {
      int numPmeOnlyRanks = getNumPmeOnlyRanksToUse(mdlog, options, mtop, ir, box, numRanksRequested);
  
-    if (ddSettings.request1D && (numRanksRequested - numPmeOnlyRanks == 1))
-    {
-        // With only one PP rank, there will not be a need for
-        // GPU-based halo exchange that wants to request that any DD
-        // has only 1 dimension.
-        return DDGridSetup{};
-    }
-
      gmx::IVec numDomains;
      if (options.numCells[XX] > 0)
      {
@@ -936,7 +918,7 @@ DDGridSetup getDDGridSetup(const gmx::MDLogger&           mdlog,
          if (ddRole == DDRole::Master)
          {
              numDomains = optimizeDDCells(mdlog, numRanksRequested, numPmeOnlyRanks, cellSizeLimit,
-                                         ddSettings.request1D, mtop, box, *ddbox, ir, systemInfo);
+                                         mtop, box, *ddbox, ir, systemInfo);
          }
      }
  
diff --git a/src/gromacs/domdec/domdec_struct.h b/src/gromacs/domdec/domdec_struct.h

index f247c4bb8c19b22c4ee12fd9411115bf19a4b3a3..db67680fb1465e67889d9f6ce471a3dba34668af 100644 (file)
--- a/src/gromacs/domdec/domdec_struct.h
+++ b/src/gromacs/domdec/domdec_struct.h
@@ -236,8 +236,8 @@ struct gmx_domdec_t
      /* gmx_pme_recv_f buffer */
      std::vector<gmx::RVec> pmeForceReceiveBuffer;
  
-    /* GPU halo exchange object */
-    std::vector<std::unique_ptr<gmx::GpuHaloExchange>> gpuHaloExchange;
+    /* GPU halo exchange objects: this structure supports a vector of pulses for each dimension */
+    std::vector<std::unique_ptr<gmx::GpuHaloExchange>> gpuHaloExchange[DIM];
  };
  
  //! Are we the master node for domain decomposition
diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h

index 9482009410c060ca3ebe880e5473fcdb89f7b76e..6f4586098e3de1632885f67a88378bd177aaccf9 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange.h
+++ b/src/gromacs/domdec/gpuhaloexchange.h
@@ -82,6 +82,7 @@ public:
       * does not yet support virial steps.
       *
       * \param [inout] dd                       domdec structure
+     * \param [in]    dimIndex                 the dimension index for this instance
       * \param [in]    mpi_comm_mysim           communicator used for simulation
       * \param [in]    deviceContext            GPU device context
       * \param [in]    streamLocal              local NB CUDA stream.
@@ -90,6 +91,7 @@ public:
       * \param [in]    wcycle                   The wallclock counter
       */
      GpuHaloExchange(gmx_domdec_t*        dd,
+                    int                  dimIndex,
                      MPI_Comm             mpi_comm_mysim,
                      const DeviceContext& deviceContext,
                      const DeviceStream&  streamLocal,
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp

index eaffb5d37111f99c7a04c78743b3234a20ffeae4..64221f0257deece465f0dc2f831f14e9c8b696e7 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp
@@ -62,6 +62,7 @@ class GpuHaloExchange::Impl
  
  /*!\brief Constructor stub. */
  GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
+                                 int /* dimIndex */,
                                   MPI_Comm /* mpi_comm_mysim */,
                                   const DeviceContext& /* deviceContext */,
                                   const DeviceStream& /*streamLocal */,
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu

index cbcff3defbc7274da10cbbbd2119fdb4cf129476..d519445f74d48d0dd2fbea7b1f623e3b7c8c5371 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -136,10 +136,30 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
      d_x_ = d_coordinatesBuffer;
      d_f_ = d_forcesBuffer;
  
-    const gmx_domdec_comm_t&     comm    = *dd_->comm;
-    const gmx_domdec_comm_dim_t& cd      = comm.cd[0];
-    const gmx_domdec_ind_t&      ind     = cd.ind[pulse_];
-    int                          newSize = ind.nsend[nzone_ + 1];
+    const gmx_domdec_comm_t&     comm = *dd_->comm;
+    const gmx_domdec_comm_dim_t& cd   = comm.cd[dimIndex_];
+    const gmx_domdec_ind_t&      ind  = cd.ind[pulse_];
+
+    numHomeAtoms_ = comm.atomRanges.numHomeAtoms(); // offset for data recieved by this rank
+
+    // Determine receive offset for the dimension index and pulse of this halo exchange object
+    int numZoneTemp   = 1;
+    int numZone       = 0;
+    int numAtomsTotal = numHomeAtoms_;
+    for (int i = 0; i <= dimIndex_; i++)
+    {
+        int pulseMax = (i == dimIndex_) ? pulse_ : (comm.cd[i].numPulses() - 1);
+        for (int p = 0; p <= pulseMax; p++)
+        {
+            atomOffset_                     = numAtomsTotal;
+            const gmx_domdec_ind_t& indTemp = comm.cd[i].ind[p];
+            numAtomsTotal += indTemp.nrecv[numZoneTemp + 1];
+        }
+        numZone = numZoneTemp;
+        numZoneTemp += numZoneTemp;
+    }
+
+    int newSize = ind.nsend[numZone + 1];
  
      GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");
  
@@ -162,28 +182,24 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
      fSendSize_ = xRecvSize_;
      fRecvSize_ = xSendSize_;
  
-    numHomeAtoms_ = comm.atomRanges.numHomeAtoms(); // offset for data recieved by this rank
-
-    GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch");
-    std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
-
-    copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_,
-                       GpuApiCallBehavior::Async, nullptr);
+    if (newSize > 0)
+    {
+        GMX_ASSERT(ind.index.size() == h_indexMap_.size(),
+                   "Size mismatch between domain decomposition communication index array and GPU "
+                   "halo exchange index mapping array");
+        std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
  
+        copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_,
+                           GpuApiCallBehavior::Async, nullptr);
+    }
      // This rank will push data to its neighbor, so needs to know
      // the remote receive address and similarly send its receive
      // address to other neighbour. We can do this here in reinit fn
      // since the pointers will not change until the next NS step.
  
      // Coordinates buffer:
+    void* recvPtr = static_cast<void*>(&d_x_[atomOffset_]);
  #if GMX_MPI
-    int pulseOffset = 0;
-    for (int p = pulse_ - 1; p >= 0; p--)
-    {
-        pulseOffset += cd.ind[p].nrecv[nzone_ + 1];
-    }
-    //    void* recvPtr = static_cast<void*>(&d_coordinatesBuffer[numHomeAtoms_ + pulseOffset]);
-    void* recvPtr = static_cast<void*>(&d_x_[numHomeAtoms_ + pulseOffset]);
      MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0, &remoteXPtr_, sizeof(void*),
                   MPI_BYTE, sendRankX_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
  
@@ -228,12 +244,9 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
      // performing pressure coupling. So, for simplicity, the box
      // is used every step to pass the shift vector as an argument of
      // the packing kernel.
-    //
-    // Because only one-dimensional DD is supported, the coordinate
-    // shift only needs to handle that dimension.
-    const int    dimensionIndex = dd_->dim[0];
-    const float3 coordinateShift{ box[dimensionIndex][XX], box[dimensionIndex][YY],
-                                  box[dimensionIndex][ZZ] };
+    const int    boxDimensionIndex = dd_->dim[dimIndex_];
+    const float3 coordinateShift{ box[boxDimensionIndex][XX], box[boxDimensionIndex][YY],
+                                  box[boxDimensionIndex][ZZ] };
  
      // Avoid launching kernel when there is no work to do
      if (size > 0)
@@ -278,8 +291,11 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
      wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF);
  
      float3* d_f = d_f_;
-
-    if (pulse_ == (dd_->comm->cd[0].numPulses() - 1))
+    // If this is the last pulse and index (noting the force halo
+    // exchanges across multiple pulses and indices are called in
+    // reverse order) then perform the following preparation
+    // activities
+    if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
      {
          if (!accumulateForces)
          {
@@ -311,11 +327,11 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
      const int*    indexMap = d_indexMap_;
      const int     size     = fRecvSize_;
  
-    if (pulse_ > 0)
+    if (pulse_ > 0 || dd_->ndim > 1)
      {
          // We need to accumulate rather than set, since it is possible
-        // that, in this pulse, a value could be written to a location
-        // corresponding to the halo region of a following pulse.
+        // that, in this pulse/dim, a value could be written to a location
+        // corresponding to the halo region of a following pulse/dim.
          accumulateForces = true;
      }
  
@@ -374,12 +390,7 @@ void GpuHaloExchange::Impl::communicateHaloData(float3*               d_ptr,
      }
      else
      {
-        int recvOffset = dd_->comm->atomRanges.end(DDAtomRanges::Type::Zones);
-        for (int p = pulse_; p < dd_->comm->cd[0].numPulses(); p++)
-        {
-            recvOffset -= dd_->comm->cd[0].ind[p].nrecv[nzone_ + 1];
-        }
-        sendPtr   = static_cast<void*>(&(d_ptr[recvOffset]));
+        sendPtr   = static_cast<void*>(&(d_ptr[atomOffset_]));
          sendSize  = fSendSize_;
          remotePtr = remoteFPtr_;
          sendRank  = sendRankF_;
@@ -440,6 +451,7 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent()
  
  /*! \brief Create Domdec GPU object */
  GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
+                            int                  dimIndex,
                              MPI_Comm             mpi_comm_mysim,
                              const DeviceContext& deviceContext,
                              const DeviceStream&  localStream,
@@ -447,11 +459,12 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
                              int                  pulse,
                              gmx_wallcycle*       wcycle) :
      dd_(dd),
-    sendRankX_(dd->neighbor[0][1]),
-    recvRankX_(dd->neighbor[0][0]),
-    sendRankF_(dd->neighbor[0][0]),
-    recvRankF_(dd->neighbor[0][1]),
-    usePBC_(dd->ci[dd->dim[0]] == 0),
+    dimIndex_(dimIndex),
+    sendRankX_(dd->neighbor[dimIndex][1]),
+    recvRankX_(dd->neighbor[dimIndex][0]),
+    sendRankF_(dd->neighbor[dimIndex][0]),
+    recvRankF_(dd->neighbor[dimIndex][1]),
+    usePBC_(dd->ci[dd->dim[dimIndex]] == 0),
      haloDataTransferLaunched_(new GpuEventSynchronizer()),
      mpi_comm_mysim_(mpi_comm_mysim),
      deviceContext_(deviceContext),
@@ -464,11 +477,6 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t*        dd,
      GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
                         "GPU Halo exchange is currently only supported with thread-MPI enabled");
  
-    if (dd->ndim > 1)
-    {
-        gmx_fatal(FARGS, "Error: dd->ndim > 1 is not yet supported in GPU halo exchange");
-    }
-
      if (usePBC_ && dd->unitCellInfo.haveScrewPBC)
      {
          gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n");
@@ -489,13 +497,14 @@ GpuHaloExchange::Impl::~Impl()
  }
  
  GpuHaloExchange::GpuHaloExchange(gmx_domdec_t*        dd,
+                                 int                  dimIndex,
                                   MPI_Comm             mpi_comm_mysim,
                                   const DeviceContext& deviceContext,
                                   const DeviceStream&  localStream,
                                   const DeviceStream&  nonLocalStream,
                                   int                  pulse,
                                   gmx_wallcycle*       wcycle) :
-    impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
+    impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
  {
  }
  
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh

index 77aaf3f9bfe1e5f5713cc90bd565d060f745aa0f..761938a0133c3c5f5645d17e2121d818356ca9a7 100644 (file)
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh
@@ -72,6 +72,7 @@ public:
      /*! \brief Creates GPU Halo Exchange object.
       *
       * \param [inout] dd                       domdec structure
+     * \param [in]    dimIndex                 the dimension index for this instance
       * \param [in]    mpi_comm_mysim           communicator used for simulation
       * \param [in]    deviceContext            GPU device context
       * \param [in]    localStream              local NB CUDA stream
@@ -80,6 +81,7 @@ public:
       * \param [in]    wcycle                   The wallclock counter
       */
      Impl(gmx_domdec_t*        dd,
+         int                  dimIndex,
           MPI_Comm             mpi_comm_mysim,
           const DeviceContext& deviceContext,
           const DeviceStream&  localStream,
@@ -198,12 +200,16 @@ private:
      float3* d_f_ = nullptr;
      //! An event recorded once the exchanged forces are ready on the GPU
      GpuEventSynchronizer fReadyOnDevice_;
+    //! The dimension index corresponding to this halo exchange instance
+    int dimIndex_ = 0;
      //! The pulse corresponding to this halo exchange instance
      int pulse_ = 0;
      //! Number of zones. Always 1 for 1-D case.
      const int nzone_ = 1;
      //! The wallclock counter
      gmx_wallcycle* wcycle_ = nullptr;
+    //! The atom offset for receive (x) or send (f) for dimension index and pulse corresponding to this halo exchange instance
+    int atomOffset_ = 0;
  };
  
  } // namespace gmx
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 11da2a775dead945da991e0b0010687d05b816cb..f8befd3ab784c3ee78ed8a81a9be5044c99338c9 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -1150,7 +1150,7 @@ void do_force(FILE*                               fplog,
      // operations were checked before construction, so here we can
      // just use it and assert upon any conditions.
      const bool ddUsesGpuDirectCommunication =
-            ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange.empty()));
+            ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty()));
      GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
                 "Must use coordinate buffer ops with GPU halo exchange");
      const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
@@ -1886,7 +1886,7 @@ void do_force(FILE*                               fplog,
              }
              if (useGpuForcesHaloExchange)
              {
-                dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent());
+                dependencyList.push_back(cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
              }
              nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
                                                dependencyList, stepWork.useGpuPmeFReduction,
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp

index 9edb2f4c5504cd0234c6ffbd42ef35c07bac8f0e..4d306410e55f190b99246ea33f5b3045ef74d455 100644 (file)
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -849,13 +849,11 @@ void gmx::LegacySimulator::do_md()
          }
  
          // Allocate or re-size GPU halo exchange object, if necessary
-        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
-            && useGpuForNonbonded && is1D(*cr->dd))
+        if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange && useGpuForNonbonded)
          {
              GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr,
                                 "GPU device manager has to be initialized to use GPU "
                                 "version of halo exchange.");
-            // TODO remove need to pass local stream into GPU halo exchange - Issue #3093
              constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle);
          }
  
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp

index 301a02ffe2e1be16d253cda408c178809087bda3..e774622f2dac537f6214aa68f7ef749a108a0f16 100644 (file)
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -1157,7 +1157,6 @@ int Mdrunner::mdrunner()
                            useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
                            *hwinfo->cpuInfo);
  
-    const bool prefer1DAnd1PulseDD = (devFlags.enableGpuHaloExchange && useGpuForNonbonded);
      // This builder is necessary while we have multi-part construction
      // of DD. Before DD is constructed, we use the existence of
      // the builder object to indicate that further construction of DD
@@ -1166,7 +1165,7 @@ int Mdrunner::mdrunner()
      if (useDomainDecomposition)
      {
          ddBuilder = std::make_unique<DomainDecompositionBuilder>(
-                mdlog, cr, domdecOptions, mdrunOptions, prefer1DAnd1PulseDD, mtop, *inputrec, box,
+                mdlog, cr, domdecOptions, mdrunOptions, mtop, *inputrec, box,
                  positionsFromStatePointer(globalState.get()));
      }
      else
author	Alan Gray <alangray3@gmail.com>
	Mon, 28 Sep 2020 20:53:07 +0000 (20:53 +0000)
committer	Artem Zhmurov <zhmurov@gmail.com>
	Mon, 28 Sep 2020 20:53:07 +0000 (20:53 +0000)
src/gromacs/domdec/builder.h		patch \| blob \| history
src/gromacs/domdec/domdec.cpp		patch \| blob \| history
src/gromacs/domdec/domdec.h		patch \| blob \| history
src/gromacs/domdec/domdec_internal.h		patch \| blob \| history
src/gromacs/domdec/domdec_setup.cpp		patch \| blob \| history
src/gromacs/domdec/domdec_struct.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange.h		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cpp		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cu		patch \| blob \| history
src/gromacs/domdec/gpuhaloexchange_impl.cuh		patch \| blob \| history
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/mdrun/md.cpp		patch \| blob \| history
src/gromacs/mdrun/runner.cpp		patch \| blob \| history