:issue:`2875`
:issue:`742`
+
+
+PME-PP GPU Direct Communication Pipelining
+""""""""""""""""""""""""""""""""""""""
+
+For multi-GPU runs with direct PME-PP GPU communication enabled, the
+PME rank can now pipeline the coordinate transfers with computation in
+the PME spline and spread kernel (where the coordinates are
+consumed). The data from each transfer is handled separately, allowing
+computation and communication to be overlapped. This is expected to
+be of most benefit on systems where hardware communication interfaces
+are shared between multiple GPUs, e.g. PCIe within multi-GPU servers
+or InfiniBand across multiple nodes.
+
+:issue:`3969`
+
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
std::vector<std::string> reasons_;
};
+class PmeCoordinateReceiverGpu;
} // namespace gmx
enum
/*! \brief
* Launches first stage of PME on GPU - spreading kernel.
*
- * \param[in] pme The PME data structure.
- * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates
- * are ready in the device memory; nullptr allowed only on separate PME ranks.
- * \param[in] wcycle The wallclock counter.
- * \param[in] lambdaQ The Coulomb lambda of the current state of the
- * system. Only used if FEP of Coulomb is active.
+ * \param[in] pme The PME data structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates
+ * are ready in the device memory; nullptr allowed only
+ * on separate PME ranks.
+ * \param[in] wcycle The wallclock counter.
+ * \param[in] lambdaQ The Coulomb lambda of the current state of the
+ * system. Only used if FEP of Coulomb is active.
+ * \param[in] useGpuDirectComm Whether direct GPU PME-PP communication is active.
+ * \param[in] pmeCoordinateReceiverGpu Coordinate receiver object, which must be valid when
+ * direct GPU PME-PP communication is active.
*/
-GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
- GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
- gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle),
- real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM;
+GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(
+ gmx_pme_t* GPU_FUNC_ARGUMENT(pme),
+ GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
+ gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle),
+ real GPU_FUNC_ARGUMENT(lambdaQ),
+ const bool GPU_FUNC_ARGUMENT(useGpuDirectComm),
+ gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM;
/*! \brief
* Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode.
#include "gromacs/utility/gmxmpi.h"
class DeviceStream;
+class DeviceContext;
+
struct PpRanks;
namespace gmx
public:
/*! \brief Creates PME GPU coordinate receiver object
- * \param[in] pmeStream CUDA stream used for PME computations
+ *
+ * For multi-GPU runs, the PME GPU can receive coordinates from
+ * multiple PP GPUs. Data from these distinct communications can
+ * be handled separately in the PME spline/spread kernel, allowing
+ * pipelining which overlaps computation and communication. The
+ * class methods are designed to be called separately for each remote
+ * PP rank, and internally a different stream is used for each
+ * remote PP rank to allow overlapping.
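+ *
+ * A usage sketch (illustrative only; receiver is an instance of this
+ * class, and the accessors are declared below):
+ * \code
+ * for (int i = 0; i < receiver.ppCommNumSenderRanks(); i++)
+ * {
+ *     // wait, in this sender's dedicated stream, for its coordinates
+ *     int sender = receiver.synchronizeOnCoordinatesFromPpRank(i, *receiver.ppCommStream(i));
+ *     // then launch per-sender work on the atom range of that sender
+ *     auto [atomStart, atomEnd] = receiver.ppCommAtomRange(sender);
+ * }
+ * \endcode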
+ *
* \param[in] comm Communicator used for simulation
+ * \param[in] deviceContext GPU context
* \param[in] ppRanks List of PP ranks
*/
- PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+ PmeCoordinateReceiverGpu(MPI_Comm comm, const DeviceContext& deviceContext, gmx::ArrayRef<PpRanks> ppRanks);
~PmeCoordinateReceiverGpu();
/*! \brief
+ * Re-initialize: set atom ranges and, for the thread-MPI case,
send coordinates buffer address to PP rank
+ * (this is required after repartitioning, since atom ranges and
+ * buffer allocations may have changed).
* \param[in] d_x coordinates buffer in GPU memory
*/
- void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
+ void reinitCoordinateReceiver(DeviceBuffer<RVec> d_x);
/*! \brief
void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
/*! \brief
- * For lib MPI, wait for coordinates from PP ranks
- * For thread MPI, enqueue PP co-ordinate transfer event into PME stream
+ * For lib MPI, wait for coordinates from any PP rank.
+ * For thread MPI, enqueue the coordinate transfer event received from the
+ * PP rank determined by the pipeline stage into the given stream.
+ * \param[in] pipelineStage stage of the pipeline corresponding to this transfer
+ * \param[in] deviceStream stream in which to enqueue the wait event.
+ * \returns rank of the sending PP task
+ */
+ int synchronizeOnCoordinatesFromPpRank(int pipelineStage, const DeviceStream& deviceStream);
+
+ /*! \brief Perform the above synchronizeOnCoordinatesFromPpRank for all PP ranks,
+ * enqueueing all events to a single stream.
+ * \param[in] deviceStream stream in which to enqueue the wait events.
+ */
+ void synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream);
+
+ /*! \brief
+ * Return pointer to the stream associated with a specific PP rank sender index.
+ * \param[in] senderIndex Index of sender PP rank.
+ */
+ DeviceStream* ppCommStream(int senderIndex);
+
+ /*! \brief
+ * Returns range of atoms involved in communication associated with a specific
+ * PP rank sender index.
+ * \param[in] senderIndex Index of sender PP rank.
+ */
+ std::tuple<int, int> ppCommAtomRange(int senderIndex);
+
+ /*! \brief
+ * Return number of PP ranks involved in PME-PP communication
*/
- void synchronizeOnCoordinatesFromPpRanks();
+ int ppCommNumSenderRanks();
private:
class Impl;
};
/*!\brief Constructor stub. */
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStream */,
- MPI_Comm /* comm */,
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm /* comm */,
+ const DeviceContext& /* deviceContext */,
gmx::ArrayRef<PpRanks> /* ppRanks */) :
impl_(nullptr)
{
PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
/*!\brief init PME-PP GPU communication stub */
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> /* d_x */)
+void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer<RVec> /* d_x */)
{
GMX_ASSERT(!impl_,
"A CPU stub for PME-PP GPU communication initialization was called instead of the "
"implementation.");
}
-void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
+int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int /* pipelineStage */,
+ const DeviceStream& /* deviceStream */)
{
GMX_ASSERT(!impl_,
"A CPU stub for PME-PP GPU communication was called instead of the correct "
"implementation.");
+ return 0;
}
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& /* deviceStream */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub for PME-PP GPU communication was called instead of the correct "
+ "implementation.");
+}
+
+DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int /* senderIndex */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub for PME-PP GPU communication was called instead of the correct "
+ "implementation.");
+ return nullptr;
+}
+
+std::tuple<int, int> PmeCoordinateReceiverGpu::ppCommAtomRange(int /* senderIndex */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub for PME-PP GPU communication was called instead of the correct "
+ "implementation.");
+ return std::make_tuple(0, 0);
+}
+
+int PmeCoordinateReceiverGpu::ppCommNumSenderRanks()
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub for PME-PP GPU communication was called instead of the correct "
+ "implementation.");
+ return 0;
+}
+
+
} // namespace gmx
#endif // !GMX_GPU_CUDA
namespace gmx
{
-PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream& pmeStream,
- MPI_Comm comm,
- gmx::ArrayRef<PpRanks> ppRanks) :
- pmeStream_(pmeStream), comm_(comm), ppRanks_(ppRanks)
+PmeCoordinateReceiverGpu::Impl::Impl(MPI_Comm comm,
+ const DeviceContext& deviceContext,
+ gmx::ArrayRef<const PpRanks> ppRanks) :
+ comm_(comm), requests_(ppRanks.size(), MPI_REQUEST_NULL), deviceContext_(deviceContext)
{
- request_.resize(ppRanks.size());
- ppSync_.resize(ppRanks.size());
+ // Create streams to manage pipelining
+ ppCommManagers_.reserve(ppRanks.size());
+ for (auto& ppRank : ppRanks)
+ {
+ ppCommManagers_.emplace_back(PpCommManager{
+ ppRank,
+ std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false),
+ nullptr,
+ { 0, 0 } });
+ }
}
PmeCoordinateReceiverGpu::Impl::~Impl() = default;
-void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
+void PmeCoordinateReceiverGpu::Impl::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x)
{
- // Need to send address to PP rank only for thread-MPI as PP rank pushes data using cudamemcpy
- if (GMX_THREAD_MPI)
+ int indEnd = 0;
+ for (auto& ppCommManager : ppCommManagers_)
{
- int ind_start = 0;
- int ind_end = 0;
- for (const auto& receiver : ppRanks_)
- {
- ind_start = ind_end;
- ind_end = ind_start + receiver.numAtoms;
+ int indStart = indEnd;
+ indEnd = indStart + ppCommManager.ppRank.numAtoms;
+ ppCommManager.atomRange = std::make_tuple(indStart, indEnd);
+
+ // Need to send address to PP rank only for thread-MPI as PP rank pushes data using cudaMemcpy
+ if (GMX_THREAD_MPI)
+ {
// Data will be transferred directly from GPU.
- void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
+ void* sendBuf = reinterpret_cast<void*>(&d_x[indStart]);
#if GMX_MPI
- MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+ MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, ppCommManager.ppRank.rankId, 0, comm_);
#else
GMX_UNUSED_VALUE(sendBuf);
#endif
#if GMX_MPI
// Receive event from PP task
// NOLINTNEXTLINE(bugprone-sizeof-expression)
- MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, &request_[recvCount_]);
- recvCount_++;
+ MPI_Irecv(&ppCommManagers_[ppRank].sync,
+ sizeof(GpuEventSynchronizer*),
+ MPI_BYTE,
+ ppRank,
+ 0,
+ comm_,
+ &(requests_[ppRank]));
#else
GMX_UNUSED_VALUE(ppRank);
#endif
"launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI");
#if GMX_MPI
- MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &request_[recvCount_++]);
+ MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &(requests_[ppRank]));
#else
GMX_UNUSED_VALUE(recvbuf);
GMX_UNUSED_VALUE(numAtoms);
#endif
}
-void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRanks()
+int PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRank(int pipelineStage,
+ const DeviceStream& deviceStream)
{
- if (recvCount_ > 0)
- {
- // ensure PME calculation doesn't commence until coordinate data/remote events
- // has been transferred
#if GMX_MPI
- MPI_Waitall(recvCount_, request_.data(), MPI_STATUS_IGNORE);
+ int senderRank = -1; // Rank of PP task that is associated with this invocation.
+# if (!GMX_THREAD_MPI)
+ // Wait on data from any one of the PP sender GPUs
+ MPI_Waitany(static_cast<int>(requests_.size()), requests_.data(), &senderRank, MPI_STATUS_IGNORE);
+ GMX_ASSERT(senderRank >= 0, "Rank of sending PP task must be 0 or greater");
+ GMX_UNUSED_VALUE(pipelineStage);
+ GMX_UNUSED_VALUE(deviceStream);
+# else
+ // MPI_Waitany is not available in thread-MPI. However, the
+ // MPI_Wait here is not associated with data but is host-side
+ // scheduling code to receive a CUDA event, and will be executed
+ // in advance of the actual data transfer. Therefore we can
+ // receive in order of pipeline stage, still allowing the
+ // scheduled GPU-direct comms to initiate out-of-order in their
+ // respective streams. For cases with CPU force computations, the
+ // scheduling is less asynchronous (done on a per-step basis), so
+ // host-side improvements should be investigated as tracked in
+ // issue #4047
+ senderRank = pipelineStage;
+ MPI_Wait(&(requests_[senderRank]), MPI_STATUS_IGNORE);
+ ppCommManagers_[senderRank].sync->enqueueWaitEvent(deviceStream);
+# endif
+ return senderRank;
+#else
+ // No-MPI build: mark the parameters as used and return an invalid rank
+ GMX_UNUSED_VALUE(pipelineStage);
+ GMX_UNUSED_VALUE(deviceStream);
+ return -1;
#endif
+}
- // Make PME stream wait on PP to PME data trasnfer events
- if (GMX_THREAD_MPI)
- {
- for (int i = 0; i < recvCount_; i++)
- {
- ppSync_[i]->enqueueWaitEvent(pmeStream_);
- }
- }
-
- // reset receive counter
- recvCount_ = 0;
+void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream)
+{
+ for (int i = 0; i < static_cast<int>(ppCommManagers_.size()); i++)
+ {
+ synchronizeOnCoordinatesFromPpRank(i, deviceStream);
}
}
+DeviceStream* PmeCoordinateReceiverGpu::Impl::ppCommStream(int senderIndex)
+{
+ return ppCommManagers_[senderIndex].stream.get();
+}
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& pmeStream,
- MPI_Comm comm,
+std::tuple<int, int> PmeCoordinateReceiverGpu::Impl::ppCommAtomRange(int senderIndex)
+{
+ return ppCommManagers_[senderIndex].atomRange;
+}
+
+int PmeCoordinateReceiverGpu::Impl::ppCommNumSenderRanks()
+{
+ return static_cast<int>(ppCommManagers_.size());
+}
+
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm comm,
+ const DeviceContext& deviceContext,
gmx::ArrayRef<PpRanks> ppRanks) :
- impl_(new Impl(pmeStream, comm, ppRanks))
+ impl_(new Impl(comm, deviceContext, ppRanks))
{
}
PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
+void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x)
{
- impl_->sendCoordinateBufferAddressToPpRanks(d_x);
+ impl_->reinitCoordinateReceiver(d_x);
}
void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank);
}
-void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
+int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int senderIndex,
+ const DeviceStream& deviceStream)
{
- impl_->synchronizeOnCoordinatesFromPpRanks();
+ return impl_->synchronizeOnCoordinatesFromPpRank(senderIndex, deviceStream);
}
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream)
+{
+ impl_->synchronizeOnCoordinatesFromAllPpRanks(deviceStream);
+}
+
+DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int senderIndex)
+{
+ return impl_->ppCommStream(senderIndex);
+}
+
+std::tuple<int, int> PmeCoordinateReceiverGpu::ppCommAtomRange(int senderIndex)
+{
+ return impl_->ppCommAtomRange(senderIndex);
+}
+
+int PmeCoordinateReceiverGpu::ppCommNumSenderRanks()
+{
+ return impl_->ppCommNumSenderRanks();
+}
+
+
} // namespace gmx
namespace gmx
{
-/*! \internal \brief Class with interfaces and data for CUDA version of PME coordinate receiving functionality */
+/*! \brief Object to manage communications with a specific PP rank */
+struct PpCommManager
+{
+ //! Details of PP rank that may be updated after repartitioning
+ const PpRanks& ppRank;
+ //! Stream used for communication with the PP rank
+ std::unique_ptr<DeviceStream> stream;
+ //! Synchronization event to receive from PP rank
+ GpuEventSynchronizer* sync = nullptr;
+ //! Range of atoms corresponding to PP rank
+ std::tuple<int, int> atomRange = { 0, 0 };
+};
+
+/*! \internal \brief Class with interfaces and data for CUDA version of PME coordinate receiving functionality */
class PmeCoordinateReceiverGpu::Impl
{
public:
/*! \brief Creates PME GPU coordinate receiver object
- * \param[in] pmeStream CUDA stream used for PME computations
* \param[in] comm Communicator used for simulation
+ * \param[in] deviceContext GPU context
* \param[in] ppRanks List of PP ranks
*/
- Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks);
+ Impl(MPI_Comm comm, const DeviceContext& deviceContext, gmx::ArrayRef<const PpRanks> ppRanks);
~Impl();
/*! \brief
- * send coordinates buffer address to PP rank
+ * Re-initialize: set atom ranges and, for the thread-MPI case,
+ * send coordinates buffer address to PP rank.
+ * This is required after repartitioning since atom ranges and
+ * buffer allocations may have changed.
* \param[in] d_x coordinates buffer in GPU memory
*/
- void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x);
+ void reinitCoordinateReceiver(DeviceBuffer<RVec> d_x);
/*! \brief
* Receive coordinate synchronizer pointer from the PP ranks.
void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
/*! \brief
- * For lib MPI, wait for coordinates from PP ranks
- * For thread MPI, enqueue PP co-ordinate transfer event into PME stream
+ * For lib MPI, wait for coordinates from any PP rank.
+ * For thread MPI, enqueue the coordinate transfer event received from the
+ * PP rank determined by the pipeline stage into the given stream.
+ * \param[in] pipelineStage stage of the pipeline corresponding to this transfer
+ * \param[in] deviceStream stream in which to enqueue the wait event.
+ * \returns rank of the sending PP task
+ */
+ int synchronizeOnCoordinatesFromPpRank(int pipelineStage, const DeviceStream& deviceStream);
+
+ /*! \brief Perform the above synchronizeOnCoordinatesFromPpRank for all PP ranks,
+ * enqueueing all events to a single stream.
+ * \param[in] deviceStream stream in which to enqueue the wait events.
+ */
+ void synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream);
+
+ /*! \brief
+ * Return pointer to the stream associated with a specific PP rank sender index.
+ * \param[in] senderIndex Index of sender PP rank.
+ */
+ DeviceStream* ppCommStream(int senderIndex);
+
+ /*! \brief
+ * Returns range of atoms involved in communication associated with a specific
+ * PP rank sender index.
+ * \param[in] senderIndex Index of sender PP rank.
+ */
+ std::tuple<int, int> ppCommAtomRange(int senderIndex);
+
+ /*! \brief
+ * Return number of PP ranks involved in PME-PP communication
*/
- void synchronizeOnCoordinatesFromPpRanks();
+ int ppCommNumSenderRanks();
private:
- //! CUDA stream for PME operations
- const DeviceStream& pmeStream_;
//! communicator for simulation
MPI_Comm comm_;
- //! list of PP ranks
- gmx::ArrayRef<PpRanks> ppRanks_;
- //! vector of MPI requests
- std::vector<MPI_Request> request_;
- //! vector of synchronization events to receive from PP tasks
- std::vector<GpuEventSynchronizer*> ppSync_;
- //! counter of messages to receive
- int recvCount_ = 0;
+ //! MPI requests, one per PP rank
+ std::vector<MPI_Request> requests_;
+ //! GPU context handle (not used in CUDA)
+ const DeviceContext& deviceContext_;
+ //! Communication manager objects corresponding to multiple sending PP ranks
+ std::vector<PpCommManager> ppCommManagers_;
};
} // namespace gmx
#include "gromacs/utility/fatalerror.h"
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "pme_gpu_internal.h"
#include "pme_gpu_settings.h"
}
}
-void pme_gpu_launch_spread(gmx_pme_t* pme,
- GpuEventSynchronizer* xReadyOnDevice,
- gmx_wallcycle* wcycle,
- const real lambdaQ)
+void pme_gpu_launch_spread(gmx_pme_t* pme,
+ GpuEventSynchronizer* xReadyOnDevice,
+ gmx_wallcycle* wcycle,
+ const real lambdaQ,
+ const bool useGpuDirectComm,
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode,
const bool spreadCharges = true;
wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
- pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ);
+ pme_gpu_spread(
+ pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
}
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#if GMX_GPU_CUDA
# include "pme.cuh"
*/
#if GMX_GPU_CUDA
- pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
+ pmeGpu->kernelParams->usePipeline = false;
+ pmeGpu->kernelParams->pipelineAtomStart = 0;
+ pmeGpu->kernelParams->pipelineAtomEnd = 0;
+ pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0];
#else
// Use this path for any non-CUDA GPU acceleration
// TODO: is there no really global work size limit in OpenCL?
return kernelPtr;
}
-void pme_gpu_spread(const PmeGpu* pmeGpu,
- GpuEventSynchronizer* xReadyOnDevice,
- real** h_grids,
- bool computeSplines,
- bool spreadCharges,
- const real lambda)
+void pme_gpu_spread(const PmeGpu* pmeGpu,
+ GpuEventSynchronizer* xReadyOnDevice,
+ real** h_grids,
+ bool computeSplines,
+ bool spreadCharges,
+ const real lambda,
+ const bool useGpuDirectComm,
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu)
{
GMX_ASSERT(
pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2,
PmeStage timingId;
PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr;
+ const bool writeGlobalOrSaveSplines = writeGlobal || (!recalculateSplines);
if (computeSplines)
{
if (spreadCharges)
timingId = PmeStage::SplineAndSpread;
kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
else
timingId = PmeStage::Spline;
kernelPtr = selectSplineKernelPtr(pmeGpu,
pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
+ writeGlobalOrSaveSplines,
pmeGpu->common->ngrids);
}
}
else
{
timingId = PmeStage::Spread;
- kernelPtr = selectSpreadKernelPtr(pmeGpu,
- pmeGpu->settings.threadsPerAtom,
- writeGlobal || (!recalculateSplines),
- pmeGpu->common->ngrids);
+ kernelPtr = selectSpreadKernelPtr(
+ pmeGpu, pmeGpu->settings.threadsPerAtom, writeGlobalOrSaveSplines, pmeGpu->common->ngrids);
}
pme_gpu_start_timing(pmeGpu, timingId);
auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId);
+
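+ // Pipelining is used only for the fused spline-and-spread launch with
+ // GPU-direct PME-PP communication from more than one sending PP rank, and
+ // only when spline data does not also need to be written to global memory.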
+ kernelParamsPtr->usePipeline = computeSplines && spreadCharges && useGpuDirectComm
+ && (pmeCoordinateReceiverGpu->ppCommNumSenderRanks() > 1)
+ && !writeGlobalOrSaveSplines;
+ if (kernelParamsPtr->usePipeline)
+ {
+ int numStagesInPipeline = pmeCoordinateReceiverGpu->ppCommNumSenderRanks();
+
+ for (int i = 0; i < numStagesInPipeline; i++)
+ {
+ int senderRank;
+ if (useGpuDirectComm)
+ {
+ senderRank = pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRank(
+ i, *(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ }
+ else
+ {
+ senderRank = i;
+ }
+
+ // set kernel configuration options specific to this stage of the pipeline
+ std::tie(kernelParamsPtr->pipelineAtomStart, kernelParamsPtr->pipelineAtomEnd) =
+ pmeCoordinateReceiverGpu->ppCommAtomRange(senderRank);
+ const int blockCount = static_cast<int>(std::ceil(
+ static_cast<float>(kernelParamsPtr->pipelineAtomEnd - kernelParamsPtr->pipelineAtomStart)
+ / atomsPerBlock));
+ auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount);
+ config.gridSize[0] = dimGrid.first;
+ config.gridSize[1] = dimGrid.second;
+ DeviceStream* launchStream = pmeCoordinateReceiverGpu->ppCommStream(senderRank);
+
+
#if c_canEmbedBuffers
- const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
#else
- const auto kernelArgs =
- prepareGpuKernelArguments(kernelPtr,
- config,
- kernelParamsPtr,
- &kernelParamsPtr->atoms.d_theta,
- &kernelParamsPtr->atoms.d_dtheta,
- &kernelParamsPtr->atoms.d_gridlineIndices,
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
- &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
- &kernelParamsPtr->grid.d_fractShiftsTable,
- &kernelParamsPtr->grid.d_gridlineIndicesTable,
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
- &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
- &kernelParamsPtr->atoms.d_coordinates);
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
#endif
- launchGpuKernel(
- kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs);
+ launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs);
+ }
+ // Set dependencies for PME stream on all pipeline streams
+ for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++)
+ {
+ GpuEventSynchronizer event;
+ event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i)));
+ event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_);
+ }
+ }
+ else // pipelining is not in use
+ {
+ if (useGpuDirectComm) // Sync all PME-PP communications to PME stream
+ {
+ pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_);
+ }
+
+#if c_canEmbedBuffers
+ const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr);
+#else
+ const auto kernelArgs =
+ prepareGpuKernelArguments(kernelPtr,
+ config,
+ kernelParamsPtr,
+ &kernelParamsPtr->atoms.d_theta,
+ &kernelParamsPtr->atoms.d_dtheta,
+ &kernelParamsPtr->atoms.d_gridlineIndices,
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A],
+ &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B],
+ &kernelParamsPtr->grid.d_fractShiftsTable,
+ &kernelParamsPtr->grid.d_gridlineIndicesTable,
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A],
+ &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B],
+ &kernelParamsPtr->atoms.d_coordinates);
+#endif
+
+ launchGpuKernel(kernelPtr,
+ config,
+ pmeGpu->archSpecific->pmeStream_,
+ timingEvent,
+ "PME spline/spread",
+ kernelArgs);
+ }
+
pme_gpu_stop_timing(pmeGpu, timingId);
const auto& settings = pmeGpu->settings;
/*! \libinternal \brief
* A GPU spline computation and charge spreading function.
*
- * \param[in] pmeGpu The PME GPU structure.
- * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory;
- * can be nullptr when invoked on a separate PME rank or from PME tests.
- * \param[out] h_grids The host-side grid buffers (used only if the result of the spread is expected on the host,
- * e.g. testing or host-side FFT)
- * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed.
- * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
- * \param[in] lambda The lambda value of the current system state.
- */
-GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
- GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
- float** GPU_FUNC_ARGUMENT(h_grids),
- bool GPU_FUNC_ARGUMENT(computeSplines),
- bool GPU_FUNC_ARGUMENT(spreadCharges),
- real GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM;
+ * \param[in] pmeGpu The PME GPU structure.
+ * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are
+ * ready in the device memory; can be nullptr when invoked
+ * on a separate PME rank or from PME tests.
+ * \param[out] h_grids The host-side grid buffers (used only if the result
+ * of the spread is expected on the host, e.g. testing
+ * or host-side FFT)
+ * \param[in] computeSplines Should the computation of spline parameters and gridline
+ * indices be performed.
+ * \param[in] spreadCharges Should the charges/coefficients be spread on the grid.
+ * \param[in] lambda The lambda value of the current system state.
+ * \param[in] useGpuDirectComm Whether direct GPU PME-PP communication is active.
+ * \param[in] pmeCoordinateReceiverGpu Coordinate receiver object, which must be valid when
+ * direct GPU PME-PP communication is active.
+ */
+GPU_FUNC_QUALIFIER void
+pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu),
+ GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice),
+ float** GPU_FUNC_ARGUMENT(h_grids),
+ bool GPU_FUNC_ARGUMENT(computeSplines),
+ bool GPU_FUNC_ARGUMENT(spreadCharges),
+ real GPU_FUNC_ARGUMENT(lambda),
+ const bool GPU_FUNC_ARGUMENT(useGpuDirectComm),
+ gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM;
/*! \libinternal \brief
* 3D FFT R2C/C2R routine.
* before launching spreading.
*/
struct PmeGpuDynamicParams current;
+
+ /*! \brief Whether pipelining with PP communications is active;
+ * char rather than bool to avoid problems with the OpenCL compiler */
+ char usePipeline;
+ /*! \brief Start atom for this stage of pipeline */
+ int pipelineAtomStart;
+ /*! \brief End atom for this stage of pipeline */
+ int pipelineAtomEnd;
+
/* These texture objects are only used in CUDA and are related to the grid size. */
/*! \brief Texture object for accessing grid.d_fractShiftsTable */
HIDE_FROM_OPENCL_COMPILER(DeviceTexture) fractShiftsTableTexture;
}
/*! \brief Called by PME-only ranks to receive coefficients and coordinates
+ *
+ * Note that with GPU direct communication the transfer is only initiated; it is the responsibility
+ * of the caller to synchronize prior to launching spread.
*
* \param[in] pme PME data structure.
* \param[in,out] pme_pp PME-PP communication structure.
"GPU Direct PME-PP communication has been enabled, "
"but PME run mode is not PmeRunMode::GPU\n");
- // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses.
- pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks(
- stateGpu->getCoordinates());
+ // This rank will have its data accessed directly by the PP ranks, so it needs to send the remote addresses and reset the atom ranges associated with the transfers.
+ pme_pp->pmeCoordinateReceiverGpu->reinitCoordinateReceiver(stateGpu->getCoordinates());
pme_pp->pmeForceSenderGpu->setForceSendBuffer(pme_gpu_get_device_f(pme));
}
}
}
}
- if (pme_pp->useGpuDirectComm)
- {
- pme_pp->pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRanks();
- }
-
status = pmerecvqxX;
}
if (useGpuPmePpCommunication)
{
pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
- deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
- pme_pp->mpi_comm_mysim,
- pme_pp->ppRanks);
+ pme_pp->mpi_comm_mysim, deviceStreamManager->context(), pme_pp->ppRanks);
pme_pp->pmeForceSenderGpu =
std::make_unique<gmx::PmeForceSenderGpu>(pme_gpu_get_f_ready_synchronizer(pme),
pme_pp->mpi_comm_mysim,
// TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157
auto xReadyOnDevice = nullptr;
- pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle, lambda_q);
+ pme_gpu_launch_spread(pme,
+ xReadyOnDevice,
+ wcycle,
+ lambda_q,
+ pme_pp->useGpuDirectComm,
+ pme_pp->pmeCoordinateReceiverGpu.get());
pme_gpu_launch_complex_transforms(pme, wcycle, stepWork);
pme_gpu_launch_gather(pme, wcycle, lambda_q);
output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle);
float atomCharge;
const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x;
- const int atomIndexOffset = blockIndex * atomsPerBlock;
+ const int atomIndexOffset = blockIndex * atomsPerBlock + kernelParams.pipelineAtomStart;
/* Thread index w.r.t. block */
const int threadLocalId =
/* Charges, required for both spline and spread */
if (c_useAtomDataPrefetch)
{
- pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(sm_coefficients,
- kernelParams.atoms.d_coefficients[0]);
+ pme_gpu_stage_atom_data<float, atomsPerBlock, 1>(
+ sm_coefficients, &kernelParams.atoms.d_coefficients[0][kernelParams.pipelineAtomStart]);
__syncthreads();
atomCharge = sm_coefficients[atomIndexLocal];
}
if (computeSplines)
{
- const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates);
+ const float3* __restrict__ gm_coordinates =
+ asFloat3(&kernelParams.atoms.d_coordinates[kernelParams.pipelineAtomStart]);
if (c_useAtomDataPrefetch)
{
// Coordinates
/* Spreading */
if (spreadCharges)
{
- spread_charges<order, wrapX, wrapY, 0, threadsPerAtom>(
- kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+
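+ // With pipelining, the last thread block of a stage can hold threads for
+ // atoms beyond this stage's range; those threads must not spread charges.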
+ if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd))
+ {
+ spread_charges<order, wrapX, wrapY, 0, threadsPerAtom>(
+ kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+ }
}
if (numGrids == 2)
{
}
if (spreadCharges)
{
- spread_charges<order, wrapX, wrapY, 1, threadsPerAtom>(
- kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+ if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd))
+ {
+ spread_charges<order, wrapX, wrapY, 1, threadsPerAtom>(
+ kernelParams, &atomCharge, sm_gridlineIndices, sm_theta);
+ }
}
}
}
#include "gromacs/utility/gmxassert.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "testutils/test_hardware_environment.h"
#include "testutils/testasserts.h"
const real lambdaQ = 1.0;
// no synchronization needed as x is transferred in the PME stream
GpuEventSynchronizer* xReadyOnDevice = nullptr;
- pme_gpu_spread(pme->gpu, xReadyOnDevice, fftgrid, computeSplines, spreadCharges, lambdaQ);
+
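+ // The tests exercise spread on a single rank, so direct PME-PP
+ // communication (and hence pipelining) is inactive here.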
+ bool useGpuDirectComm = false;
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+
+ pme_gpu_spread(pme->gpu,
+ xReadyOnDevice,
+ fftgrid,
+ computeSplines,
+ spreadCharges,
+ lambdaQ,
+ useGpuDirectComm,
+ pmeCoordinateReceiverGpu);
}
break;
#endif
#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "gromacs/ewald/pme_pp.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
gmx_wallcycle* wcycle)
{
pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
- pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
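+ // On this path PME runs on a rank that also does PP work, so there is
+ // no PME-PP communication to pipeline.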
+ bool useGpuDirectComm = false;
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+ pme_gpu_launch_spread(
+ pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
}
/*! \brief Launch the FFT and gather stages of PME GPU