From: Alan Gray Date: Fri, 6 Dec 2019 13:19:43 +0000 (-0800) Subject: Multiple pulses for GPU Halo Exchange X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=fdf8c9064e2289bb01b1536a06f83755450caee8;p=alexxy%2Fgromacs.git Multiple pulses for GPU Halo Exchange Removes restriction on single pulse. Implements #3106 Change-Id: I5d68258de831d04c14d6c352fc52e51852fccd80 --- diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index c625bdf7f9..13b7a33e2b 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -2949,17 +2949,13 @@ static bool canMake1DAnd1PulseDomainDecomposition(const DDSettings& return canMakeDDWith1DAnd1Pulse; } -bool is1DAnd1PulseDD(const gmx_domdec_t& dd) +bool is1D(const gmx_domdec_t& dd) { const int maxDimensionSize = std::max(dd.numCells[XX], std::max(dd.numCells[YY], dd.numCells[ZZ])); const int productOfDimensionSizes = dd.numCells[XX] * dd.numCells[YY] * dd.numCells[ZZ]; const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes); - const bool hasMax1Pulse = - ((isDlbDisabled(dd.comm) && dd.comm->cellsize_limit >= dd.comm->systemInfo.cutoff) - || (!isDlbDisabled(dd.comm) && dd.comm->maxpulse == 1)); - - return decompositionHasOneDimension && hasMax1Pulse; + return decompositionHasOneDimension; } namespace gmx @@ -3216,3 +3212,60 @@ gmx_bool change_dd_cutoff(t_commrec* cr, const matrix box, gmx::ArrayRefgpuHaloExchange.empty()) + { + GMX_LOG(mdlog.warning) + .asParagraph() + .appendTextFormatted( + "NOTE: Activating the 'GPU halo exchange' feature, enabled " + "by the " + "GMX_GPU_DD_COMMS environment variable."); + } + else + { + gpuHaloExchangeSize = static_cast(cr.dd->gpuHaloExchange.size()); + pulseStart = gpuHaloExchangeSize - 1; + } + if (cr.dd->comm->cd[0].numPulses() > gpuHaloExchangeSize) + { + for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++) + { + cr.dd->gpuHaloExchange.push_back(std::make_unique( + cr.dd, cr.mpi_comm_mysim, streamLocal, streamNonLocal, pulse)); + } + } +} + +void reinitGpuHaloExchange(const t_commrec& cr, + const DeviceBuffer d_coordinatesBuffer, + const DeviceBuffer d_forcesBuffer) +{ + for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++) + { + cr.dd->gpuHaloExchange[pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer); + } +} + +void communicateGpuHaloCoordinates(const t_commrec& cr, + const matrix box, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) +{ + for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++) + { + cr.dd->gpuHaloExchange[pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent); + } +} + +void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces) +{ + for (int pulse = cr.dd->comm->cd[0].numPulses() - 1; pulse >= 0; pulse--) + { + cr.dd->gpuHaloExchange[pulse]->communicateHaloForces(accumulateForces); + } +} diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h index 439e86b5e7..1f3fad4405 100644 --- a/src/gromacs/domdec/domdec.h +++ b/src/gromacs/domdec/domdec.h @@ -62,6 +62,7 @@ #include +#include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/math/vectypes.h" #include "gromacs/utility/arrayref.h" #include "gromacs/utility/basedefinitions.h" @@ -84,6 +85,7 @@ struct t_nrnb; struct gmx_wallcycle; enum class PbcType : int; class t_state; +class GpuEventSynchronizer; namespace gmx { @@ -152,12 +154,12 @@ bool ddHaveSplitConstraints(const gmx_domdec_t& dd); /*! \brief Return whether update groups are used */ bool ddUsesUpdateGroups(const gmx_domdec_t& dd); -/*! \brief Return whether the DD has a single dimension with a single pulse +/*! \brief Return whether the DD has a single dimension * - * The GPU halo exchange code requires a 1D single-pulse DD, and its - * setup code can use the returned value to understand what it should - * do. */ -bool is1DAnd1PulseDD(const gmx_domdec_t& dd); + * The GPU halo exchange code requires a 1D DD, and its setup code can + * use the returned value to understand what it should do. + */ +bool is1D(const gmx_domdec_t& dd); /*! \brief Initialize data structures for bonded interactions */ void dd_init_bondeds(FILE* fplog, @@ -316,4 +318,39 @@ void dd_bonded_cg_distance(const gmx::MDLogger& mdlog, real* r_2b, real* r_mb); +/*! \brief Construct the GPU halo exchange object(s) + * \param[in] mdlog The logger object + * \param[in] cr The commrec object + * \param[in] streamLocal The local GPU stream + * \param[in] streamNonLocal The non-local GPU stream + */ +void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, void* streamLocal, void* streamNonLocal); + +/*! \brief + * (Re-) Initialization for GPU halo exchange + * \param [in] cr The commrec object + * \param [in] d_coordinatesBuffer pointer to coordinates buffer in GPU memory + * \param [in] d_forcesBuffer pointer to forces buffer in GPU memory + */ +void reinitGpuHaloExchange(const t_commrec& cr, + DeviceBuffer d_coordinatesBuffer, + DeviceBuffer d_forcesBuffer); + + +/*! \brief GPU halo exchange of coordinates buffer. + * \param [in] cr The commrec object + * \param [in] box Coordinate box (from which shifts will be constructed) + * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device + */ +void communicateGpuHaloCoordinates(const t_commrec& cr, + const matrix box, + GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); + + +/*! \brief GPU halo exchange of force buffer. + * \param [in] cr The commrec object + * \param [in] accumulateForces True if forces should accumulate, otherwise they are set + */ +void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces); + #endif diff --git a/src/gromacs/domdec/domdec_struct.h b/src/gromacs/domdec/domdec_struct.h index 12a134c96c..8670ca81c6 100644 --- a/src/gromacs/domdec/domdec_struct.h +++ b/src/gromacs/domdec/domdec_struct.h @@ -3,7 +3,7 @@ * * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. - * Copyright (c) 2013,2014,2015,2018,2019, by the GROMACS development team, led by + * Copyright (c) 2013,2014,2015,2018,2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -237,7 +237,7 @@ struct gmx_domdec_t std::vector pmeForceReceiveBuffer; /* GPU halo exchange object */ - std::unique_ptr gpuHaloExchange; + std::vector> gpuHaloExchange; }; //! Are we the master node for domain decomposition diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h index d32b1800c4..dc65cb93d3 100644 --- a/src/gromacs/domdec/gpuhaloexchange.h +++ b/src/gromacs/domdec/gpuhaloexchange.h @@ -82,8 +82,9 @@ public: * \param [in] mpi_comm_mysim communicator used for simulation * \param [in] streamLocal local NB CUDA stream. * \param [in] streamNonLocal non-local NB CUDA stream. + * \param [in] pulse the communication pulse for this instance */ - GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* streamLocal, void* streamNonLocal); + GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* streamLocal, void* streamNonLocal, int pulse); ~GpuHaloExchange(); /*! \brief diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp index a17c550c6c..1ce9a9d93e 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp @@ -63,7 +63,8 @@ class GpuHaloExchange::Impl GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */, MPI_Comm /* mpi_comm_mysim */, void* /*streamLocal */, - void* /*streamNonLocal */) : + void* /*streamNonLocal */, + int /*pulse */) : impl_(nullptr) { GMX_ASSERT(false, diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index 4313ffacb0..9c5aa5b8fe 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -134,14 +134,11 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo d_f_ = d_forcesBuffer; cudaStream_t stream = nonLocalStream_; - int nzone = 1; const gmx_domdec_comm_t& comm = *dd_->comm; const gmx_domdec_comm_dim_t& cd = comm.cd[0]; - const gmx_domdec_ind_t& ind = cd.ind[0]; - int newSize = ind.nsend[nzone + 1]; + const gmx_domdec_ind_t& ind = cd.ind[pulse_]; + int newSize = ind.nsend[nzone_ + 1]; - GMX_RELEASE_ASSERT(cd.numPulses() == 1, - "Multiple pulses are not yet supported in GPU halo exchange"); GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange"); // reallocates only if needed @@ -178,7 +175,13 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo // Coordinates buffer: #if GMX_MPI - void* recvPtr = static_cast(&d_coordinatesBuffer[numHomeAtoms_]); + int pulseOffset = 0; + for (int p = pulse_ - 1; p >= 0; p--) + { + pulseOffset += cd.ind[p].nrecv[nzone_ + 1]; + } + // void* recvPtr = static_cast(&d_coordinatesBuffer[numHomeAtoms_ + pulseOffset]); + void* recvPtr = static_cast(&d_x_[numHomeAtoms_ + pulseOffset]); MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0, &remoteXPtr_, sizeof(void*), MPI_BYTE, sendRankX_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE); @@ -188,7 +191,6 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo MPI_BYTE, sendRankF_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE); #endif - return; } @@ -196,8 +198,11 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) { - // ensure stream waits until coordinate data is available on device - coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_); + if (pulse_ == 0) + { + // ensure stream waits until coordinate data is available on device + coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_); + } // launch kernel to pack send buffer KernelLaunchConfig config; @@ -252,19 +257,22 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) float3* d_f = d_f_; - if (!accumulateForces) + if (pulse_ == (dd_->comm->cd[0].numPulses() - 1)) { - // Clear local portion of force array (in local stream) - cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_); - } + if (!accumulateForces) + { + // Clear local portion of force array (in local stream) + cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_); + } - // ensure non-local stream waits for local stream, due to dependence on - // the previous H2D copy of CPU forces (if accumulateForces is true) - // or the above clearing. - // TODO remove this dependency on localStream - edmine issue #3093 - GpuEventSynchronizer eventLocal; - eventLocal.markEvent(localStream_); - eventLocal.enqueueWaitEvent(nonLocalStream_); + // ensure non-local stream waits for local stream, due to dependence on + // the previous H2D copy of CPU forces (if accumulateForces is true) + // or the above clearing. + // TODO remove this dependency on localStream - edmine issue #3093 + GpuEventSynchronizer eventLocal; + eventLocal.markEvent(localStream_); + eventLocal.enqueueWaitEvent(nonLocalStream_); + } // Unpack halo buffer into force array @@ -282,6 +290,14 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) const int* indexMap = d_indexMap_; const int size = fRecvSize_; + if (pulse_ > 0) + { + // We need to accumulate rather than set, since it is possible + // that, in this pulse, a value could be written to a location + // corresponding to the halo region of a following pulse. + accumulateForces = true; + } + if (size > 0) { auto kernelFn = accumulateForces ? unpackRecvBufKernel : unpackRecvBufKernel; @@ -291,7 +307,11 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs); } - fReadyOnDevice_.markEvent(nonLocalStream_); + + if (pulse_ == 0) + { + fReadyOnDevice_.markEvent(nonLocalStream_); + } } @@ -329,7 +349,12 @@ void GpuHaloExchange::Impl::communicateHaloData(float3* d_ptr, } else { - sendPtr = static_cast(&(d_ptr[numHomeAtoms_])); + int recvOffset = dd_->comm->atomRanges.end(DDAtomRanges::Type::Zones); + for (int p = pulse_; p < dd_->comm->cd[0].numPulses(); p++) + { + recvOffset -= dd_->comm->cd[0].ind[p].nrecv[nzone_ + 1]; + } + sendPtr = static_cast(&(d_ptr[recvOffset])); sendSize = fSendSize_; remotePtr = remoteFPtr_; sendRank = sendRankF_; @@ -389,7 +414,11 @@ GpuEventSynchronizer* GpuHaloExchange::Impl::getForcesReadyOnDeviceEvent() } /*! \brief Create Domdec GPU object */ -GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) : +GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, + MPI_Comm mpi_comm_mysim, + void* localStream, + void* nonLocalStream, + int pulse) : dd_(dd), sendRankX_(dd->neighbor[0][1]), recvRankX_(dd->neighbor[0][0]), @@ -399,7 +428,8 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* loc haloDataTransferLaunched_(new GpuEventSynchronizer()), mpi_comm_mysim_(mpi_comm_mysim), localStream_(*static_cast(localStream)), - nonLocalStream_(*static_cast(nonLocalStream)) + nonLocalStream_(*static_cast(nonLocalStream)), + pulse_(pulse) { GMX_RELEASE_ASSERT(GMX_THREAD_MPI, @@ -429,8 +459,12 @@ GpuHaloExchange::Impl::~Impl() delete haloDataTransferLaunched_; } -GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) : - impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream)) +GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, + MPI_Comm mpi_comm_mysim, + void* localStream, + void* nonLocalStream, + int pulse) : + impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, pulse)) { } diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index 017cb19186..b139a9b491 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019, by the GROMACS development team, led by + * Copyright (c) 2019,2020, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -72,8 +72,9 @@ public: * \param [in] mpi_comm_mysim communicator used for simulation * \param [in] localStream local NB CUDA stream * \param [in] nonLocalStream non-local NB CUDA stream + * \param [in] pulse the communication pulse for this instance */ - Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream); + Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream, int pulse); ~Impl(); /*! \brief @@ -184,6 +185,10 @@ private: float3* d_f_ = nullptr; //! An event recorded once the exchanged forces are ready on the GPU GpuEventSynchronizer fReadyOnDevice_; + //! The pulse corresponding to this halo exchange instance + int pulse_ = 0; + //! Number of zones. Always 1 for 1-D case. + const int nzone_ = 1; }; } // namespace gmx diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 07644556aa..61878b8f5a 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1001,6 +1001,7 @@ void do_force(FILE* fplog, { if (stepWork.doNeighborSearch) { + // TODO refactor this to do_md, after partitioning. stateGpu->reinit(mdatoms->homenr, cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr); if (useGpuPmeOnThisRank) @@ -1023,9 +1024,8 @@ void do_force(FILE* fplog, // The conditions for gpuHaloExchange e.g. using GPU buffer // operations were checked before construction, so here we can // just use it and assert upon any conditions. - gmx::GpuHaloExchange* gpuHaloExchange = - (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr); - const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr); + const bool ddUsesGpuDirectCommunication = + ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange.empty())); GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps, "Must use coordinate buffer ops with GPU halo exchange"); const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps; @@ -1259,9 +1259,13 @@ void do_force(FILE* fplog, nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal); wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL); wallcycle_stop(wcycle, ewcNS); + // TODO refactor this GPU halo exchange re-initialisation + // to location in do_md where GPU halo exchange is + // constructed at partitioning, after above stateGpu + // re-initialization has similarly been refactored if (ddUsesGpuDirectCommunication) { - gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces()); + reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces()); } } else @@ -1270,7 +1274,7 @@ void do_force(FILE* fplog, { // The following must be called after local setCoordinates (which records an event // when the coordinate data has been copied to the device). - gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice); + communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice); if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork) { @@ -1590,7 +1594,7 @@ void do_force(FILE* fplog, { stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local); } - gpuHaloExchange->communicateHaloForces(domainWork.haveCpuLocalForceWork); + communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork); } else { @@ -1731,7 +1735,7 @@ void do_force(FILE* fplog, } if (useGpuForcesHaloExchange) { - dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent()); + dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent()); } nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr, dependencyList, stepWork.useGpuPmeFReduction, diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index b86fc12686..5314e3ac9c 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -58,6 +58,7 @@ #include "gromacs/domdec/domdec.h" #include "gromacs/domdec/domdec_network.h" #include "gromacs/domdec/domdec_struct.h" +#include "gromacs/domdec/gpuhaloexchange.h" #include "gromacs/domdec/mdsetup.h" #include "gromacs/domdec/partition.h" #include "gromacs/essentialdynamics/edsam.h" @@ -843,6 +844,18 @@ void gmx::LegacySimulator::do_md() fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting); shouldCheckNumberOfBondedInteractions = true; upd.setNumAtoms(state->natoms); + + // Allocate or re-size GPU halo exchange object, if necessary + if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange + && useGpuForNonbonded && is1D(*cr->dd)) + { + // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093 + void* streamLocal = + Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local); + void* streamNonLocal = Nbnxm::gpu_get_command_stream( + fr->nbv->gpu_nbv, InteractionLocality::NonLocal); + constructGpuHaloExchange(mdlog, *cr, streamLocal, streamNonLocal); + } } } diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp index 3debb2046e..41f609d1cc 100644 --- a/src/gromacs/mdrun/runner.cpp +++ b/src/gromacs/mdrun/runner.cpp @@ -254,7 +254,8 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md GMX_LOG(mdlog.warning) .asParagraph() .appendTextFormatted( - "This run uses the 'GPU halo exchange' feature, enabled by the " + "This run has requested the 'GPU halo exchange' feature, enabled by " + "the " "GMX_GPU_DD_COMMS environment variable."); } else @@ -1358,26 +1359,6 @@ int Mdrunner::mdrunner() fr->gpuBonded = gpuBonded.get(); } - // TODO Move this to happen during domain decomposition setup, - // once stream and event handling works well with that. - // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093 - if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd)) - { - GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps, - "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1"); - void* streamLocal = - Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local); - void* streamNonLocal = - Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal); - GMX_LOG(mdlog.warning) - .asParagraph() - .appendTextFormatted( - "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the " - "GMX_GPU_DD_COMMS environment variable."); - cr->dd->gpuHaloExchange = std::make_unique( - cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal); - } - /* Initialize the mdAtoms structure. * mdAtoms is not filled with atom data, * as this can not be done now with domain decomposition.