Removes restriction on single pulse.
Implements #3106
Change-Id: I5d68258de831d04c14d6c352fc52e51852fccd80
return canMakeDDWith1DAnd1Pulse;
}
-bool is1DAnd1PulseDD(const gmx_domdec_t& dd)
+bool is1D(const gmx_domdec_t& dd)
{
const int maxDimensionSize = std::max(dd.numCells[XX], std::max(dd.numCells[YY], dd.numCells[ZZ]));
const int productOfDimensionSizes = dd.numCells[XX] * dd.numCells[YY] * dd.numCells[ZZ];
const bool decompositionHasOneDimension = (maxDimensionSize == productOfDimensionSizes);
- const bool hasMax1Pulse =
- ((isDlbDisabled(dd.comm) && dd.comm->cellsize_limit >= dd.comm->systemInfo.cutoff)
- || (!isDlbDisabled(dd.comm) && dd.comm->maxpulse == 1));
-
- return decompositionHasOneDimension && hasMax1Pulse;
+ return decompositionHasOneDimension;
}
namespace gmx
return bCutoffAllowed;
}
+
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, void* streamLocal, void* streamNonLocal)
+{
+
+ int gpuHaloExchangeSize = 0;
+ int pulseStart = 0;
+ if (cr.dd->gpuHaloExchange.empty())
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "NOTE: Activating the 'GPU halo exchange' feature, enabled "
+ "by the "
+ "GMX_GPU_DD_COMMS environment variable.");
+ }
+ else
+ {
+ gpuHaloExchangeSize = static_cast<int>(cr.dd->gpuHaloExchange.size());
+ pulseStart = gpuHaloExchangeSize - 1;
+ }
+ if (cr.dd->comm->cd[0].numPulses() > gpuHaloExchangeSize)
+ {
+ for (int pulse = pulseStart; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+ {
+ cr.dd->gpuHaloExchange.push_back(std::make_unique<gmx::GpuHaloExchange>(
+ cr.dd, cr.mpi_comm_mysim, streamLocal, streamNonLocal, pulse));
+ }
+ }
+}
+
+void reinitGpuHaloExchange(const t_commrec& cr,
+ const DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
+ const DeviceBuffer<gmx::RVec> d_forcesBuffer)
+{
+ for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+ {
+ cr.dd->gpuHaloExchange[pulse]->reinitHalo(d_coordinatesBuffer, d_forcesBuffer);
+ }
+}
+
+void communicateGpuHaloCoordinates(const t_commrec& cr,
+ const matrix box,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+ for (int pulse = 0; pulse < cr.dd->comm->cd[0].numPulses(); pulse++)
+ {
+ cr.dd->gpuHaloExchange[pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+ }
+}
+
+void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
+{
+ for (int pulse = cr.dd->comm->cd[0].numPulses() - 1; pulse >= 0; pulse--)
+ {
+ cr.dd->gpuHaloExchange[pulse]->communicateHaloForces(accumulateForces);
+ }
+}
#include <vector>
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
struct gmx_wallcycle;
enum class PbcType : int;
class t_state;
+class GpuEventSynchronizer;
namespace gmx
{
/*! \brief Return whether update groups are used */
bool ddUsesUpdateGroups(const gmx_domdec_t& dd);
-/*! \brief Return whether the DD has a single dimension with a single pulse
+/*! \brief Return whether the DD has a single dimension
*
- * The GPU halo exchange code requires a 1D single-pulse DD, and its
- * setup code can use the returned value to understand what it should
- * do. */
-bool is1DAnd1PulseDD(const gmx_domdec_t& dd);
+ * The GPU halo exchange code requires a 1D DD, and its setup code can
+ * use the returned value to understand what it should do.
+ */
+bool is1D(const gmx_domdec_t& dd);
/*! \brief Initialize data structures for bonded interactions */
void dd_init_bondeds(FILE* fplog,
real* r_2b,
real* r_mb);
+/*! \brief Construct the GPU halo exchange object(s)
+ * \param[in] mdlog The logger object
+ * \param[in] cr The commrec object
+ * \param[in] streamLocal The local GPU stream
+ * \param[in] streamNonLocal The non-local GPU stream
+ */
+void constructGpuHaloExchange(const gmx::MDLogger& mdlog, const t_commrec& cr, void* streamLocal, void* streamNonLocal);
+
+/*! \brief
+ * (Re-) Initialization for GPU halo exchange
+ * \param [in] cr The commrec object
+ * \param [in] d_coordinatesBuffer pointer to coordinates buffer in GPU memory
+ * \param [in] d_forcesBuffer pointer to forces buffer in GPU memory
+ */
+void reinitGpuHaloExchange(const t_commrec& cr,
+ DeviceBuffer<gmx::RVec> d_coordinatesBuffer,
+ DeviceBuffer<gmx::RVec> d_forcesBuffer);
+
+
+/*! \brief GPU halo exchange of coordinates buffer.
+ * \param [in] cr The commrec object
+ * \param [in] box Coordinate box (from which shifts will be constructed)
+ * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+ */
+void communicateGpuHaloCoordinates(const t_commrec& cr,
+ const matrix box,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
+
+/*! \brief GPU halo exchange of force buffer.
+ * \param [in] cr The commrec object
+ * \param [in] accumulateForces True if forces should accumulate, otherwise they are set
+ */
+void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces);
+
#endif
*
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2018,2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
std::vector<gmx::RVec> pmeForceReceiveBuffer;
/* GPU halo exchange object */
- std::unique_ptr<gmx::GpuHaloExchange> gpuHaloExchange;
+ std::vector<std::unique_ptr<gmx::GpuHaloExchange>> gpuHaloExchange;
};
//! Are we the master node for domain decomposition
* \param [in] mpi_comm_mysim communicator used for simulation
* \param [in] streamLocal local NB CUDA stream.
* \param [in] streamNonLocal non-local NB CUDA stream.
+ * \param [in] pulse the communication pulse for this instance
*/
- GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* streamLocal, void* streamNonLocal);
+ GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* streamLocal, void* streamNonLocal, int pulse);
~GpuHaloExchange();
/*! \brief
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */,
MPI_Comm /* mpi_comm_mysim */,
void* /*streamLocal */,
- void* /*streamNonLocal */) :
+ void* /*streamNonLocal */,
+ int /*pulse */) :
impl_(nullptr)
{
GMX_ASSERT(false,
d_f_ = d_forcesBuffer;
cudaStream_t stream = nonLocalStream_;
- int nzone = 1;
const gmx_domdec_comm_t& comm = *dd_->comm;
const gmx_domdec_comm_dim_t& cd = comm.cd[0];
- const gmx_domdec_ind_t& ind = cd.ind[0];
- int newSize = ind.nsend[nzone + 1];
+ const gmx_domdec_ind_t& ind = cd.ind[pulse_];
+ int newSize = ind.nsend[nzone_ + 1];
- GMX_RELEASE_ASSERT(cd.numPulses() == 1,
- "Multiple pulses are not yet supported in GPU halo exchange");
GMX_ASSERT(cd.receiveInPlace, "Out-of-place receive is not yet supported in GPU halo exchange");
// reallocates only if needed
// Coordinates buffer:
#if GMX_MPI
- void* recvPtr = static_cast<void*>(&d_coordinatesBuffer[numHomeAtoms_]);
+ int pulseOffset = 0;
+ for (int p = pulse_ - 1; p >= 0; p--)
+ {
+ pulseOffset += cd.ind[p].nrecv[nzone_ + 1];
+ }
+ // void* recvPtr = static_cast<void*>(&d_coordinatesBuffer[numHomeAtoms_ + pulseOffset]);
+ void* recvPtr = static_cast<void*>(&d_x_[numHomeAtoms_ + pulseOffset]);
MPI_Sendrecv(&recvPtr, sizeof(void*), MPI_BYTE, recvRankX_, 0, &remoteXPtr_, sizeof(void*),
MPI_BYTE, sendRankX_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
MPI_BYTE, sendRankF_, 0, mpi_comm_mysim_, MPI_STATUS_IGNORE);
#endif
-
return;
}
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
- // ensure stream waits until coordinate data is available on device
- coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+ if (pulse_ == 0)
+ {
+ // ensure stream waits until coordinate data is available on device
+ coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+ }
// launch kernel to pack send buffer
KernelLaunchConfig config;
float3* d_f = d_f_;
- if (!accumulateForces)
+ if (pulse_ == (dd_->comm->cd[0].numPulses() - 1))
{
- // Clear local portion of force array (in local stream)
- cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_);
- }
+ if (!accumulateForces)
+ {
+ // Clear local portion of force array (in local stream)
+ cudaMemsetAsync(d_f, 0, numHomeAtoms_ * sizeof(rvec), localStream_);
+ }
- // ensure non-local stream waits for local stream, due to dependence on
- // the previous H2D copy of CPU forces (if accumulateForces is true)
- // or the above clearing.
- // TODO remove this dependency on localStream - edmine issue #3093
- GpuEventSynchronizer eventLocal;
- eventLocal.markEvent(localStream_);
- eventLocal.enqueueWaitEvent(nonLocalStream_);
+ // ensure non-local stream waits for local stream, due to dependence on
+ // the previous H2D copy of CPU forces (if accumulateForces is true)
+ // or the above clearing.
+ // TODO remove this dependency on localStream - edmine issue #3093
+ GpuEventSynchronizer eventLocal;
+ eventLocal.markEvent(localStream_);
+ eventLocal.enqueueWaitEvent(nonLocalStream_);
+ }
// Unpack halo buffer into force array
const int* indexMap = d_indexMap_;
const int size = fRecvSize_;
+ if (pulse_ > 0)
+ {
+ // We need to accumulate rather than set, since it is possible
+ // that, in this pulse, a value could be written to a location
+ // corresponding to the halo region of a following pulse.
+ accumulateForces = true;
+ }
+
if (size > 0)
{
auto kernelFn = accumulateForces ? unpackRecvBufKernel<true> : unpackRecvBufKernel<false>;
launchGpuKernel(kernelFn, config, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
}
- fReadyOnDevice_.markEvent(nonLocalStream_);
+
+ if (pulse_ == 0)
+ {
+ fReadyOnDevice_.markEvent(nonLocalStream_);
+ }
}
}
else
{
- sendPtr = static_cast<void*>(&(d_ptr[numHomeAtoms_]));
+ int recvOffset = dd_->comm->atomRanges.end(DDAtomRanges::Type::Zones);
+ for (int p = pulse_; p < dd_->comm->cd[0].numPulses(); p++)
+ {
+ recvOffset -= dd_->comm->cd[0].ind[p].nrecv[nzone_ + 1];
+ }
+ sendPtr = static_cast<void*>(&(d_ptr[recvOffset]));
sendSize = fSendSize_;
remotePtr = remoteFPtr_;
sendRank = sendRankF_;
}
/*! \brief Create Domdec GPU object */
-GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
+GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd,
+ MPI_Comm mpi_comm_mysim,
+ void* localStream,
+ void* nonLocalStream,
+ int pulse) :
dd_(dd),
sendRankX_(dd->neighbor[0][1]),
recvRankX_(dd->neighbor[0][0]),
haloDataTransferLaunched_(new GpuEventSynchronizer()),
mpi_comm_mysim_(mpi_comm_mysim),
localStream_(*static_cast<cudaStream_t*>(localStream)),
- nonLocalStream_(*static_cast<cudaStream_t*>(nonLocalStream))
+ nonLocalStream_(*static_cast<cudaStream_t*>(nonLocalStream)),
+ pulse_(pulse)
{
GMX_RELEASE_ASSERT(GMX_THREAD_MPI,
delete haloDataTransferLaunched_;
}
-GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream) :
- impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream))
+GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd,
+ MPI_Comm mpi_comm_mysim,
+ void* localStream,
+ void* nonLocalStream,
+ int pulse) :
+ impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, pulse))
{
}
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
* \param [in] mpi_comm_mysim communicator used for simulation
* \param [in] localStream local NB CUDA stream
* \param [in] nonLocalStream non-local NB CUDA stream
+ * \param [in] pulse the communication pulse for this instance
*/
- Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream);
+ Impl(gmx_domdec_t* dd, MPI_Comm mpi_comm_mysim, void* localStream, void* nonLocalStream, int pulse);
~Impl();
/*! \brief
float3* d_f_ = nullptr;
//! An event recorded once the exchanged forces are ready on the GPU
GpuEventSynchronizer fReadyOnDevice_;
+ //! The pulse corresponding to this halo exchange instance
+ int pulse_ = 0;
+ //! Number of zones. Always 1 for 1-D case.
+ const int nzone_ = 1;
};
} // namespace gmx
{
if (stepWork.doNeighborSearch)
{
+ // TODO refactor this to do_md, after partitioning.
stateGpu->reinit(mdatoms->homenr,
cr->dd != nullptr ? dd_numAtomsZones(*cr->dd) : mdatoms->homenr);
if (useGpuPmeOnThisRank)
// The conditions for gpuHaloExchange e.g. using GPU buffer
// operations were checked before construction, so here we can
// just use it and assert upon any conditions.
- gmx::GpuHaloExchange* gpuHaloExchange =
- (havePPDomainDecomposition(cr) ? cr->dd->gpuHaloExchange.get() : nullptr);
- const bool ddUsesGpuDirectCommunication = (gpuHaloExchange != nullptr);
+ const bool ddUsesGpuDirectCommunication =
+ ((cr->dd != nullptr) && (!cr->dd->gpuHaloExchange.empty()));
GMX_ASSERT(!ddUsesGpuDirectCommunication || stepWork.useGpuXBufferOps,
"Must use coordinate buffer ops with GPU halo exchange");
const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && stepWork.useGpuFBufferOps;
nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
wallcycle_stop(wcycle, ewcNS);
+ // TODO refactor this GPU halo exchange re-initialisation
+ // to location in do_md where GPU halo exchange is
+ // constructed at partitioning, after above stateGpu
+ // re-initialization has similarly been refactored
if (ddUsesGpuDirectCommunication)
{
- gpuHaloExchange->reinitHalo(stateGpu->getCoordinates(), stateGpu->getForces());
+ reinitGpuHaloExchange(*cr, stateGpu->getCoordinates(), stateGpu->getForces());
}
}
else
{
// The following must be called after local setCoordinates (which records an event
// when the coordinate data has been copied to the device).
- gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice);
+ communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
{
{
stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
}
- gpuHaloExchange->communicateHaloForces(domainWork.haveCpuLocalForceWork);
+ communicateGpuHaloForces(*cr, domainWork.haveCpuLocalForceWork);
}
else
{
}
if (useGpuForcesHaloExchange)
{
- dependencyList.push_back(gpuHaloExchange->getForcesReadyOnDeviceEvent());
+ dependencyList.push_back(cr->dd->gpuHaloExchange[0]->getForcesReadyOnDeviceEvent());
}
nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
dependencyList, stepWork.useGpuPmeFReduction,
#include "gromacs/domdec/domdec.h"
#include "gromacs/domdec/domdec_network.h"
#include "gromacs/domdec/domdec_struct.h"
+#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/domdec/mdsetup.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
fr, vsite, constr, nrnb, wcycle, do_verbose && !bPMETunePrinting);
shouldCheckNumberOfBondedInteractions = true;
upd.setNumAtoms(state->natoms);
+
+ // Allocate or re-size GPU halo exchange object, if necessary
+ if (havePPDomainDecomposition(cr) && simulationWork.useGpuHaloExchange
+ && useGpuForNonbonded && is1D(*cr->dd))
+ {
+ // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
+ void* streamLocal =
+ Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
+ void* streamNonLocal = Nbnxm::gpu_get_command_stream(
+ fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
+ constructGpuHaloExchange(mdlog, *cr, streamLocal, streamNonLocal);
+ }
}
}
GMX_LOG(mdlog.warning)
.asParagraph()
.appendTextFormatted(
- "This run uses the 'GPU halo exchange' feature, enabled by the "
+ "This run has requested the 'GPU halo exchange' feature, enabled by "
+ "the "
"GMX_GPU_DD_COMMS environment variable.");
}
else
fr->gpuBonded = gpuBonded.get();
}
- // TODO Move this to happen during domain decomposition setup,
- // once stream and event handling works well with that.
- // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
- if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
- {
- GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps,
- "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
- void* streamLocal =
- Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
- void* streamNonLocal =
- Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
- GMX_LOG(mdlog.warning)
- .asParagraph()
- .appendTextFormatted(
- "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
- "GMX_GPU_DD_COMMS environment variable.");
- cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(
- cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal);
- }
-
/* Initialize the mdAtoms structure.
* mdAtoms is not filled with atom data,
* as this can not be done now with domain decomposition.