From: Alan Gray Date: Mon, 28 Sep 2020 20:53:07 +0000 (+0000) Subject: Remove single-dimension limitation from GPU halo exchange X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=e6650c000e5e243fea99649ceec47864d07ccc0a;p=alexxy%2Fgromacs.git Remove single-dimension limitation from GPU halo exchange Allows GPU halo exchange to be active when the number of dimensions is greater than 1, thus simplifying the codepath logic. Partly addresses #3370 --- diff --git a/src/gromacs/domdec/builder.h b/src/gromacs/domdec/builder.h index ea31080c62..01239c8058 100644 --- a/src/gromacs/domdec/builder.h +++ b/src/gromacs/domdec/builder.h @@ -79,7 +79,6 @@ public: t_commrec* cr, const DomdecOptions& options, const MdrunOptions& mdrunOptions, - bool prefer1D, const gmx_mtop_t& mtop, const t_inputrec& ir, const matrix box, diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index ddb74cbc3d..d3ba5aba08 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -2865,7 +2865,6 @@ static DDSettings getDDSettings(const gmx::MDLogger& mdlog, ddSettings.useSendRecv2 = (dd_getenv(mdlog, "GMX_DD_USE_SENDRECV2", 0) != 0); ddSettings.dlb_scale_lim = dd_getenv(mdlog, "GMX_DLB_MAX_BOX_SCALING", 10); - ddSettings.request1D = bool(dd_getenv(mdlog, "GMX_DD_1D", 0)); ddSettings.useDDOrderZYX = bool(dd_getenv(mdlog, "GMX_DD_ORDER_ZYX", 0)); ddSettings.useCartesianReorder = bool(dd_getenv(mdlog, "GMX_NO_CART_REORDER", 1)); ddSettings.eFlop = dd_getenv(mdlog, "GMX_DLB_BASED_ON_FLOPS", 0); @@ -2904,63 +2903,6 @@ static DDSettings getDDSettings(const gmx::MDLogger& mdlog, gmx_domdec_t::gmx_domdec_t(const t_inputrec& ir) : unitCellInfo(ir) {} -/*! \brief Return whether the simulation described can run a 1D DD. - * - * The GPU halo exchange code requires 1D DD. Such a DD - * generally requires a larger box than other possible decompositions - * with the same rank count, so the calling code might need to decide - * what is the most appropriate way to run the simulation based on - * whether such a DD is possible. - * - * This function works like init_domain_decomposition(), but will not - * give a fatal error, and only uses \c cr for communicating between - * ranks. - * - * It is safe to call before thread-MPI spawns ranks, so that - * thread-MPI can decide whether and how to trigger the GPU halo - * exchange code path. The number of PME ranks, if any, should be set - * in \c options.numPmeRanks. - */ -static bool canMake1DDomainDecomposition(const DDSettings& ddSettingsOriginal, - DDRole ddRole, - MPI_Comm communicator, - const int numRanksRequested, - const DomdecOptions& options, - const gmx_mtop_t& mtop, - const t_inputrec& ir, - const matrix box, - gmx::ArrayRef xGlobal) -{ - // Ensure we don't write any output from this checking routine - gmx::MDLogger dummyLogger; - - DDSystemInfo systemInfo = - getSystemInfo(dummyLogger, ddRole, communicator, options, mtop, ir, box, xGlobal); - - DDSettings ddSettings = ddSettingsOriginal; - ddSettings.request1D = true; - const real gridSetupCellsizeLimit = - getDDGridSetupCellSizeLimit(dummyLogger, !isDlbDisabled(ddSettings.initialDlbState), - options.dlbScaling, ir, systemInfo.cellsizeLimit); - gmx_ddbox_t ddbox = { 0 }; - DDGridSetup ddGridSetup = - getDDGridSetup(dummyLogger, ddRole, communicator, numRanksRequested, options, ddSettings, - systemInfo, gridSetupCellsizeLimit, mtop, ir, box, xGlobal, &ddbox); - - const bool canMake1DDD = (ddGridSetup.numDomains[XX] != 0); - - return canMake1DDD; -} - -bool is1D(const gmx_domdec_t& dd) -{ - const int maxDimensionSize = std::max(dd.numCells[XX], std::max(dd.numCells[YY], dd.numCells[ZZ])); - const int productOfDimensionSizes = dd.numCells[XX] * dd.numCells[YY] * dd.numCells[ZZ]; - const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes); - - return decompositionHasOneDimension; -} - namespace gmx { @@ -2975,7 +2917,6 @@ public: t_commrec* cr, const DomdecOptions& options, const MdrunOptions& mdrunOptions, - bool prefer1D, const gmx_mtop_t& mtop, const t_inputrec& ir, const matrix box, @@ -3023,7 +2964,6 @@ DomainDecompositionBuilder::Impl::Impl(const MDLogger& mdlog, t_commrec* cr, const DomdecOptions& options, const MdrunOptions& mdrunOptions, - const bool prefer1D, const gmx_mtop_t& mtop, const t_inputrec& ir, const matrix box, @@ -3038,14 +2978,6 @@ DomainDecompositionBuilder::Impl::Impl(const MDLogger& mdlog, ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_); - if (prefer1D - && canMake1DDomainDecomposition(ddSettings_, MASTER(cr_) ? DDRole::Master : DDRole::Agent, - cr->mpiDefaultCommunicator, cr_->sizeOfDefaultCommunicator, - options_, mtop_, ir_, box, xGlobal)) - { - ddSettings_.request1D = true; - } - if (ddSettings_.eFlop > 1) { /* Ensure that we have different random flop counts on different ranks */ @@ -3118,12 +3050,11 @@ DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger& mdlo t_commrec* cr, const DomdecOptions& options, const MdrunOptions& mdrunOptions, - const bool prefer1D, const gmx_mtop_t& mtop, const t_inputrec& ir, const matrix box, ArrayRef xGlobal) : - impl_(new Impl(mdlog, cr, options, mdrunOptions, prefer1D, mtop, ir, box, xGlobal)) + impl_(new Impl(mdlog, cr, options, mdrunOptions, mtop, ir, box, xGlobal)) { } @@ -3227,9 +3158,8 @@ void constructGpuHaloExchange(const gmx::MDLogger& mdlog, GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal), "Non-local non-bonded stream should be valid when using " "GPU halo exchange."); - int gpuHaloExchangeSize = 0; - int pulseStart = 0; - if (cr.dd->gpuHaloExchange.empty()) + + if (cr.dd->gpuHaloExchange[0].empty()) { GMX_LOG(mdlog.warning) .asParagraph() @@ -3238,17 +3168,13 @@ void constructGpuHaloExchange(const gmx::MDLogger& mdlog, "by the " "GMX_GPU_DD_COMMS environment variable."); } - else - { - gpuHaloExchangeSize = static_cast(cr.dd->gpuHaloExchange.size()); - pulseStart = gpuHaloExchangeSize - 1; - } - if (cr.dd->comm->cd[0].numPulses() > gpuHaloExchangeSize) + + for (int d = 0; d < cr.dd->ndim; d++) { - for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++) + for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++) { - cr.dd->gpuHaloExchange.push_back(std::make_unique( - cr.dd, cr.mpi_comm_mysim, deviceStreamManager.context(), + cr.dd->gpuHaloExchange[d].push_back(std::make_unique( + cr.dd, d, cr.mpi_comm_mysim, deviceStreamManager.context(), deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal), deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), pulse, wcycle)); } @@ -3259,9 +3185,12 @@ void reinitGpuHaloExchange(const t_commrec& cr, const DeviceBuffer d_coordinatesBuffer, const DeviceBuffer d_forcesBuffer) { - for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++) + for (int d = 0; d < cr.dd->ndim; d++) { - cr.dd->gpuHaloExchange[pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer); + for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++) + { + cr.dd->gpuHaloExchange[d][pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer); + } } } @@ -3269,16 +3198,22 @@ void communicateGpuHaloCoordinates(const t_commrec& cr, const matrix box, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) { - for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++) + for (int d = 0; d < cr.dd->ndim; d++) { - cr.dd->gpuHaloExchange[pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent); + for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++) + { + cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent); + } } } void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces) { - for (int pulse = cr.dd->comm->cd[0].numPulses() - 1; pulse >= 0; pulse--) + for (int d = cr.dd->ndim - 1; d >= 0; d--) { - cr.dd->gpuHaloExchange[pulse]->communicateHaloForces(accumulateForces); + for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--) + { + cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces); + } } } diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h index b6c79dca00..2b65e1989b 100644 --- a/src/gromacs/domdec/domdec.h +++ b/src/gromacs/domdec/domdec.h @@ -156,13 +156,6 @@ bool ddHaveSplitConstraints(const gmx_domdec_t& dd); /*! \brief Return whether update groups are used */ bool ddUsesUpdateGroups(const gmx_domdec_t& dd); -/*! \brief Return whether the DD has a single dimension - * - * The GPU halo exchange code requires a 1D DD, and its setup code can - * use the returned value to understand what it should do. - */ -bool is1D(const gmx_domdec_t& dd); - /*! \brief Initialize data structures for bonded interactions */ void dd_init_bondeds(FILE* fplog, gmx_domdec_t* dd, diff --git a/src/gromacs/domdec/domdec_internal.h b/src/gromacs/domdec/domdec_internal.h index 7bee8f8198..691b0b87aa 100644 --- a/src/gromacs/domdec/domdec_internal.h +++ b/src/gromacs/domdec/domdec_internal.h @@ -473,9 +473,6 @@ struct DDSettings //! Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise int eFlop = 0; - //! Request 1D domain decomposition - bool request1D; - //! Whether to order the DD dimensions from z to x bool useDDOrderZYX = false; diff --git a/src/gromacs/domdec/domdec_setup.cpp b/src/gromacs/domdec/domdec_setup.cpp index d4c7dc342e..1b058becc5 100644 --- a/src/gromacs/domdec/domdec_setup.cpp +++ b/src/gromacs/domdec/domdec_setup.cpp @@ -553,7 +553,6 @@ static float comm_cost_est(real limit, /*! \brief Assign penalty factors to possible domain decompositions, * based on the estimated communication costs. */ static void assign_factors(const real limit, - const bool request1D, const real cutoff, const matrix box, const gmx_ddbox_t& ddbox, @@ -573,14 +572,6 @@ static void assign_factors(const real limit, if (ndiv == 0) { - const int maxDimensionSize = std::max(ir_try[XX], std::max(ir_try[YY], ir_try[ZZ])); - const int productOfDimensionSizes = ir_try[XX] * ir_try[YY] * ir_try[ZZ]; - const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes); - if (request1D && !decompositionHasOneDimension) - { - /* We requested 1D DD, but got multiple dimensions */ - return; - } ce = comm_cost_est(limit, cutoff, box, ddbox, natoms, ir, pbcdxr, npme, ir_try); if (ce >= 0 @@ -611,8 +602,8 @@ static void assign_factors(const real limit, } /* recurse */ - assign_factors(limit, request1D, cutoff, box, ddbox, natoms, ir, pbcdxr, npme, ndiv - 1, - div + 1, mdiv + 1, irTryPtr, opt); + assign_factors(limit, cutoff, box, ddbox, natoms, ir, pbcdxr, npme, ndiv - 1, div + 1, + mdiv + 1, irTryPtr, opt); for (i = 0; i < mdiv[0] - x - y; i++) { @@ -639,7 +630,6 @@ static gmx::IVec optimizeDDCells(const gmx::MDLogger& mdlog, const int numRanksRequested, const int numPmeOnlyRanks, const real cellSizeLimit, - const bool request1D, const gmx_mtop_t& mtop, const matrix box, const gmx_ddbox_t& ddbox, @@ -716,7 +706,7 @@ static gmx::IVec optimizeDDCells(const gmx::MDLogger& mdlog, gmx::IVec itry = { 1, 1, 1 }; gmx::IVec numDomains = { 0, 0, 0 }; - assign_factors(cellSizeLimit, request1D, systemInfo.cutoff, box, ddbox, mtop.natoms, ir, pbcdxr, + assign_factors(cellSizeLimit, systemInfo.cutoff, box, ddbox, mtop.natoms, ir, pbcdxr, numRanksDoingPmeWork, div.size(), div.data(), mdiv.data(), &itry, &numDomains); return numDomains; @@ -914,14 +904,6 @@ DDGridSetup getDDGridSetup(const gmx::MDLogger& mdlog, { int numPmeOnlyRanks = getNumPmeOnlyRanksToUse(mdlog, options, mtop, ir, box, numRanksRequested); - if (ddSettings.request1D && (numRanksRequested - numPmeOnlyRanks == 1)) - { - // With only one PP rank, there will not be a need for - // GPU-based halo exchange that wants to request that any DD - // has only 1 dimension. - return DDGridSetup{}; - } - gmx::IVec numDomains; if (options.numCells[XX] > 0) { @@ -936,7 +918,7 @@ DDGridSetup getDDGridSetup(const gmx::MDLogger& mdlog, if (ddRole == DDRole::Master) { numDomains = optimizeDDCells(mdlog, numRanksRequested, numPmeOnlyRanks, cellSizeLimit, - ddSettings.request1D, mtop, box, *ddbox, ir, systemInfo); + mtop, box, *ddbox, ir, systemInfo); } } diff --git a/src/gromacs/domdec/domdec_struct.h b/src/gromacs/domdec/domdec_struct.h index f247c4bb8c..db67680fb1 100644 --- a/src/gromacs/domdec/domdec_struct.h +++ b/src/gromacs/domdec/domdec_struct.h @@ -236,8 +236,8 @@ struct gmx_domdec_t /* gmx_pme_recv_f buffer */ std::vector pmeForceReceiveBuffer; - /* GPU halo exchange object */ - std::vector> gpuHaloExchange; + /* GPU halo exchange objects: this structure supports a vector of pulses for each dimension */ + std::vector> gpuHaloExchange[DIM]; }; //! Are we the master node for domain decomposition diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h index 9482009410..6f4586098e 100644 --- a/src/gromacs/domdec/gpuhaloexchange.h +++ b/src/gromacs/domdec/gpuhaloexchange.h @@ -82,6 +82,7 @@ public: * does not yet support virial steps. * * \param [inout] dd domdec structure + * \param [in] dimIndex the dimension index for this instance * \param [in] mpi_comm_mysim communicator used for simulation * \param [in] deviceContext GPU device context * \param [in] streamLocal local NB CUDA stream. @@ -90,6 +91,7 @@ public: * \param [in] wcycle The wallclock counter */ GpuHaloExchange(gmx_domdec_t* dd, + int dimIndex, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, const DeviceStream& streamLocal, diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp index eaffb5d371..64221f0257 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp @@ -62,6 +62,7 @@ class GpuHaloExchange::Impl /*!\brief Constructor stub. */ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */, + int /* dimIndex */, MPI_Comm /* mpi_comm_mysim */, const DeviceContext& /* deviceContext */, const DeviceStream& /*streamLocal */, diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index cbcff3defb..d519445f74 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -136,10 +136,30 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo d_x_ = d_coordinatesBuffer; d_f_ = d_forcesBuffer; - const gmx_domdec_comm_t& comm = *dd_->comm; - const gmx_domdec_comm_dim_t& cd = comm.cd[0]; - const gmx_domdec_ind_t& ind = cd.ind[pulse_]; - int newSize = ind.nsend[nzone_ + 1]; + const gmx_domdec_comm_t& comm = *dd_->comm; + const gmx_domdec_comm_dim_t& cd = comm.cd[dimIndex_]; + const gmx_domdec_ind_t& ind = cd.ind[pulse_]; + + numHomeAtoms_ = comm.atomRanges.numHomeAtoms(); // offset for data recieved by this rank + + // Determine receive offset for the dimension index and pulse of this halo exchange object + int numZoneTemp = 1; + int numZone = 0; + int numAtomsTotal = numHomeAtoms_; + for (int i = 0; i <= dimIndex_; i++) + { + int pulseMax = (i == dimIndex_) ? pulse_ : (comm.cd[i].numPulses() - 1); + for (int p = 0; p <= pulseMax; p++) + { + atomOffset_ = numAtomsTotal; + const gmx_domdec_ind_t& indTemp = comm.cd[i].ind[p]; + numAtomsTotal += indTemp.nrecv[numZoneTemp + 1]; + } + numZone = numZoneTemp; + numZoneTemp += numZoneTemp; + } + + int newSize = ind.nsend[numZone + 1]; GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange"); @@ -162,28 +182,24 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo fSendSize_ = xRecvSize_; fRecvSize_ = xSendSize_; - numHomeAtoms_ = comm.atomRanges.numHomeAtoms(); // offset for data recieved by this rank - - GMX_ASSERT(ind.index.size() == h_indexMap_.size(), "Size mismatch"); - std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin()); - - copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, - GpuApiCallBehavior::Async, nullptr); + if (newSize > 0) + { + GMX_ASSERT(ind.index.size() == h_indexMap_.size(), + "Size mismatch between domain decomposition communication index array and GPU " + "halo exchange index mapping array"); + std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin()); + copyToDeviceBuffer(&d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, + GpuApiCallBehavior::Async, nullptr); + } // This rank will push data to its neighbor, so needs to know // the remote receive address and similarly send its receive // address to other neighbour. We can do this here in reinit fn // since the pointers will not change until the next NS step. // Coordinates buffer: + void* recvPtr = static_cast(&d_x_[atomOffset_]); #if GMX_MPI - int pulseOffset = 0; - for (int p = pulse_ - 1; p >= 0; p--) - { - pulseOffset += cd.ind[p].nrecv[nzone_ + 1]; - } - // void* recvPtr = static_cast(&d_coordinatesBuffer[numHomeAtoms_ + pulseOffset]); - void* recvPtr = static_cast(&d_x_[numHomeAtoms_ + pulseOffset]); MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0, &remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE); @@ -228,12 +244,9 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box // performing pressure coupling. So, for simplicity, the box // is used every step to pass the shift vector as an argument of // the packing kernel. - // - // Because only one-dimensional DD is supported, the coordinate - // shift only needs to handle that dimension. - const int dimensionIndex = dd_->dim[0]; - const float3 coordinateShift{ box[dimensionIndex][XX], box[dimensionIndex][YY], - box[dimensionIndex][ZZ] }; + const int boxDimensionIndex = dd_->dim[dimIndex_]; + const float3 coordinateShift{ box[boxDimensionIndex][XX], box[boxDimensionIndex][YY], + box[boxDimensionIndex][ZZ] }; // Avoid launching kernel when there is no work to do if (size > 0) @@ -278,8 +291,11 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF); float3* d_f = d_f_; - - if (pulse_ == (dd_->comm->cd[0].numPulses() - 1)) + // If this is the last pulse and index (noting the force halo + // exchanges across multiple pulses and indices are called in + // reverse order) then perform the following preparation + // activities + if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1))) { if (!accumulateForces) { @@ -311,11 +327,11 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) const int* indexMap = d_indexMap_; const int size = fRecvSize_; - if (pulse_ > 0) + if (pulse_ > 0 || dd_->ndim > 1) { // We need to accumulate rather than set, since it is possible - // that, in this pulse, a value could be written to a location - // corresponding to the halo region of a following pulse. + // that, in this pulse/dim, a value could be written to a location + // corresponding to the halo region of a following pulse/dim. accumulateForces = true; } @@ -374,12 +390,7 @@ void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr, } else { - int recvOffset = dd_->comm->atomRanges.end(DDAtomRanges::Type::Zones); - for (int p = pulse_; p < dd_->comm->cd[0].numPulses(); p++) - { - recvOffset -= dd_->comm->cd[0].ind[p].nrecv[nzone_ + 1]; - } - sendPtr = static_cast(&(d_ptr[recvOffset])); + sendPtr = static_cast(&(d_ptr[atomOffset_])); sendSize = fSendSize_; remotePtr = remoteFPtr_; sendRank = sendRankF_; @@ -440,6 +451,7 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent() /*! \brief Create Domdec GPU object */ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, + int dimIndex, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, const DeviceStream& localStream, @@ -447,11 +459,12 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, int pulse, gmx_wallcycle* wcycle) : dd_(dd), - sendRankX_(dd->neighbor[0][1]), - recvRankX_(dd->neighbor[0][0]), - sendRankF_(dd->neighbor[0][0]), - recvRankF_(dd->neighbor[0][1]), - usePBC_(dd->ci[dd->dim[0]] == 0), + dimIndex_(dimIndex), + sendRankX_(dd->neighbor[dimIndex][1]), + recvRankX_(dd->neighbor[dimIndex][0]), + sendRankF_(dd->neighbor[dimIndex][0]), + recvRankF_(dd->neighbor[dimIndex][1]), + usePBC_(dd->ci[dd->dim[dimIndex]] == 0), haloDataTransferLaunched_(new GpuEventSynchronizer()), mpi_comm_mysim_(mpi_comm_mysim), deviceContext_(deviceContext), @@ -464,11 +477,6 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "GPU Halo exchange is currently only supported with thread-MPI enabled"); - if (dd->ndim > 1) - { - gmx_fatal(FARGS, "Error: dd->ndim > 1 is not yet supported in GPU halo exchange"); - } - if (usePBC_ && dd->unitCellInfo.haveScrewPBC) { gmx_fatal(FARGS, "Error: screw is not yet supported in GPU halo exchange\n"); @@ -489,13 +497,14 @@ GpuHaloExchange::Impl::~Impl() } GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, + int dimIndex, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, const DeviceStream& localStream, const DeviceStream& nonLocalStream, int pulse, gmx_wallcycle* wcycle) : - impl_(new Impl(dd, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle)) + impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle)) { } diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index 77aaf3f9bf..761938a013 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -72,6 +72,7 @@ public: /*! \brief Creates GPU Halo Exchange object. * * \param [inout] dd domdec structure + * \param [in] dimIndex the dimension index for this instance * \param [in] mpi_comm_mysim communicator used for simulation * \param [in] deviceContext GPU device context * \param [in] localStream local NB CUDA stream @@ -80,6 +81,7 @@ public: * \param [in] wcycle The wallclock counter */ Impl(gmx_domdec_t* dd, + int dimIndex, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, const DeviceStream& localStream, @@ -198,12 +200,16 @@ private: float3* d_f_ = nullptr; //! An event recorded once the exchanged forces are ready on the GPU GpuEventSynchronizer fReadyOnDevice_; + //! The dimension index corresponding to this halo exchange instance + int dimIndex_ = 0; //! The pulse corresponding to this halo exchange instance int pulse_ = 0; //! Number of zones. Always 1 for 1-D case. const int nzone_ = 1; //! The wallclock counter gmx_wallcycle* wcycle_ = nullptr; + //! The atom offset for receive (x) or send (f) for dimension index and pulse corresponding to this halo exchange instance + int atomOffset_ = 0; }; } // namespace gmx diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 11da2a775d..f8befd3ab7 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1150,7 +1150,7 @@ void do_force(FILE* fplog, // operations were checked before construction, so here we can // just use it and assert upon any conditions. const bool ddUsesGpuDirectCommunication = - ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange.empty())); + ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange[0].empty())); GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps, "Must use coordinate buffer ops with GPU halo exchange"); const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps; @@ -1886,7 +1886,7 @@ void do_force(FILE* fplog, } if (useGpuForcesHaloExchange) { - dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent()); + dependencyList.push_back(cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent()); } nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr, dependencyList, stepWork.useGpuPmeFReduction, diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 9edb2f4c55..4d306410e5 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -849,13 +849,11 @@ void gmx::LegacySimulator::do_md() } // Allocate or re-size GPU halo exchange object, if necessary - if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange - && useGpuForNonbonded && is1D(*cr->dd)) + if (bNS && havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange && useGpuForNonbonded) { GMX_RELEASE_ASSERT(fr->deviceStreamManager != nullptr, "GPU device manager has to be initialized to use GPU " "version of halo exchange."); - // TODO remove need to pass local stream into GPU halo exchange - Issue #3093 constructGpuHaloExchange(mdlog, *cr, *fr->deviceStreamManager, wcycle); } diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 301a02ffe2..e774622f2d 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -1157,7 +1157,6 @@ int Mdrunner::mdrunner() useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes), *hwinfo->cpuInfo); - const bool prefer1DAnd1PulseDD = (devFlags.enableGpuHaloExchange && useGpuForNonbonded); // This builder is necessary while we have multi-part construction // of DD. Before DD is constructed, we use the existence of // the builder object to indicate that further construction of DD @@ -1166,7 +1165,7 @@ int Mdrunner::mdrunner() if (useDomainDecomposition) { ddBuilder = std::make_unique( - mdlog, cr, domdecOptions, mdrunOptions, prefer1DAnd1PulseDD, mtop, *inputrec, box, + mdlog, cr, domdecOptions, mdrunOptions, mtop, *inputrec, box, positionsFromStatePointer(globalState.get())); } else