the compiler to the MPI compiler wrapper but it is neither necessary
nor recommended.
+CUDA-Aware MPI support
+~~~~~~~~~~~~~~~~~~~~~~
+
+In simulations using multiple NVIDIA GPUs, an MPI implementation with CUDA support
+(also called "CUDA-aware") allows communication to be performed directly between the
+distinct GPU memory spaces without staging through CPU memory, often
+resulting in higher-bandwidth and lower-latency communication. For
+more details, see `Introduction to CUDA-aware MPI
+<https://developer.nvidia.com/blog/introduction-cuda-aware-mpi/>`_.
+
+To use CUDA-aware MPI for direct GPU communication, we recommend
+using the latest OpenMPI version (>=4.1.0) with the latest UCX version
+(>=1.10), since most GROMACS internal testing of CUDA-aware support has
+been performed using these versions. OpenMPI with CUDA-aware support can
+be built following the procedure in `these OpenMPI build instructions
+<https://www.open-mpi.org/faq/?category=buildcuda>`_.
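+
+As a minimal sketch (the installation paths and UCX location below are
+placeholders, not requirements), a CUDA-aware OpenMPI build on top of UCX
+might be configured as::
+
+    ./configure --with-cuda=/usr/local/cuda --with-ucx=/path/to/ucx \
+                --prefix=/path/to/openmpi-cuda
+    make install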
+
+With ``GMX_MPI=ON``, GROMACS attempts to automatically detect CUDA support
+in the underlying MPI library at compile time, and enables direct GPU
+communication when this is detected. However, there are some cases when
+GROMACS may fail to detect existing CUDA-aware support, in which case
+it can be manually enabled by setting the environment variable
+``GMX_FORCE_CUDA_AWARE_MPI=1`` at runtime. Such cases still lack substantial
+testing, so we urge users to carefully check the correctness of results
+against those obtained with default build options, and to report any issues.
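+
+For example, to force-enable the CUDA-aware path at runtime (a sketch; the
+rank count and input file name are placeholders)::
+
+    GMX_FORCE_CUDA_AWARE_MPI=1 mpirun -np 8 gmx_mpi mdrun -deffnm topol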
+
CMake
^^^^^
void receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank);
/*! \brief
- * enqueue wait for coordinate data from PP ranks
+ * Used with lib-MPI; receives coordinates from a PP rank
+ * \param[in] recvbuf coordinates buffer in GPU memory
+ * \param[in] numAtoms starting offset (in atoms) within the buffer
+ * \param[in] numBytes number of bytes to transfer
+ * \param[in] ppRank PP rank that sends the data
*/
- void enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+ void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
+
+ /*! \brief
+ * For lib-MPI, waits for the coordinates from all PP ranks.
+ * For thread-MPI, enqueues the PP coordinate transfer events into the PME stream.
+ */
+ void synchronizeOnCoordinatesFromPpRanks();
private:
class Impl;
"implementation.");
}
-void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> /* recvbuf */,
+ int /* numAtoms */,
+ int /* numBytes */,
+ int /* ppRank */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub for PME-PP GPU communication was called instead of the correct "
+ "implementation.");
+}
+
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
{
GMX_ASSERT(!impl_,
"A CPU stub for PME-PP GPU communication was called instead of the correct "
*/
#include "gmxpre.h"
+#include "gromacs/ewald/pme_pp_communication.h"
#include "pme_coordinate_receiver_gpu_impl.h"
#include "config.h"
comm_(comm),
ppRanks_(ppRanks)
{
- GMX_RELEASE_ASSERT(
- GMX_THREAD_MPI,
- "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
request_.resize(ppRanks.size());
ppSync_.resize(ppRanks.size());
}
void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x)
{
-
- int ind_start = 0;
- int ind_end = 0;
- for (const auto& receiver : ppRanks_)
+ // Need to send the address to PP ranks only for thread-MPI, where each PP rank pushes data directly using cudaMemcpy
+ if (GMX_THREAD_MPI)
{
- ind_start = ind_end;
- ind_end = ind_start + receiver.numAtoms;
-
- // Data will be transferred directly from GPU.
- void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
+ int ind_start = 0;
+ int ind_end = 0;
+ for (const auto& receiver : ppRanks_)
+ {
+ ind_start = ind_end;
+ ind_end = ind_start + receiver.numAtoms;
+ // Data will be transferred directly from GPU.
+ void* sendBuf = reinterpret_cast<void*>(&d_x[ind_start]);
#if GMX_MPI
- MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+ MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
#else
- GMX_UNUSED_VALUE(sendBuf);
+ GMX_UNUSED_VALUE(sendBuf);
#endif
+ }
}
}
/*! \brief Receive coordinate synchronizer pointer from the PP ranks. */
void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
{
+ GMX_ASSERT(GMX_THREAD_MPI,
+ "receiveCoordinatesSynchronizerFromPpCudaDirect is expected to be called only for "
+ "Thread-MPI");
+
// Data will be pushed directly from PP task
#if GMX_MPI
#endif
}
-void PmeCoordinateReceiverGpu::Impl::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+/*! \brief Receive coordinate data using CUDA-aware MPI */
+void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+ int numAtoms,
+ int numBytes,
+ int ppRank)
+{
+ GMX_ASSERT(GMX_LIB_MPI,
+ "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI");
+
+#if GMX_MPI
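+ // Post a non-blocking receive directly into GPU memory; the request is
+ // completed later in synchronizeOnCoordinatesFromPpRanks()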
+ MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &request_[recvCount_++]);
+#else
+ GMX_UNUSED_VALUE(recvbuf);
+ GMX_UNUSED_VALUE(numAtoms);
+ GMX_UNUSED_VALUE(numBytes);
+ GMX_UNUSED_VALUE(ppRank);
+#endif
+}
+
+void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRanks()
{
if (recvCount_ > 0)
{
- // ensure PME calculation doesn't commence until coordinate data has been transferred
+ // ensure the PME calculation doesn't commence until the coordinate data/remote
+ // events have been transferred
#if GMX_MPI
MPI_Waitall(recvCount_, request_.data(), MPI_STATUS_IGNORE);
#endif
- for (int i = 0; i < recvCount_; i++)
+
+ // Make the PME stream wait on the PP-to-PME data transfer events
+ if (GMX_THREAD_MPI)
{
- ppSync_[i]->enqueueWaitEvent(pmeStream_);
+ for (int i = 0; i < recvCount_; i++)
+ {
+ ppSync_[i]->enqueueWaitEvent(pmeStream_);
+ }
}
+
// reset receive counter
recvCount_ = 0;
}
impl_->receiveCoordinatesSynchronizerFromPpCudaDirect(ppRank);
}
-void PmeCoordinateReceiverGpu::enqueueWaitReceiveCoordinatesFromPpCudaDirect()
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+ int numAtoms,
+ int numBytes,
+ int ppRank)
+{
+ impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank);
+}
+
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks()
{
- impl_->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+ impl_->synchronizeOnCoordinatesFromPpRanks();
}
} // namespace gmx
void receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank);
/*! \brief
- * enqueue wait for coordinate data from PP ranks
+ * Used with lib-MPI; receives coordinates from a PP rank
+ * \param[in] recvbuf coordinates buffer in GPU memory
+ * \param[in] numAtoms starting offset (in atoms) within the buffer
+ * \param[in] numBytes number of bytes to transfer
+ * \param[in] ppRank PP rank that sends the data
*/
- void enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+ void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank);
+
+ /*! \brief
+ * For lib-MPI, waits for the coordinates from all PP ranks.
+ * For thread-MPI, enqueues the PP coordinate transfer events into the PME stream.
+ */
+ void synchronizeOnCoordinatesFromPpRanks();
private:
//! CUDA stream for PME operations
void sendForceBufferAddressToPpRanks(DeviceBuffer<RVec> d_f);
/*! \brief
- * Send force synchronizer to PP rank
+ * Send force synchronizer to PP rank (used with Thread-MPI)
* \param[in] ppRank PP rank to receive data
*/
void sendFSynchronizerToPpCudaDirect(int ppRank);
+ /*! \brief
+ * Send force data to a PP rank (used with Lib-MPI)
+ * \param[in] sendbuf force buffer in GPU memory
+ * \param[in] offset starting element in buffer
+ * \param[in] numBytes number of bytes to transfer
+ * \param[in] ppRank PP rank to receive data
+ * \param[in] request MPI request to track asynchronous MPI call status
+ */
+ void sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf, int offset, int numBytes, int ppRank, MPI_Request* request);
+
private:
class Impl;
std::unique_ptr<Impl> impl_;
"implementation.");
}
+void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer<RVec> /* sendbuf */,
+ int /* offset */,
+ int /* numBytes */,
+ int /* ppRank */,
+ MPI_Request* /* request */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub for PME-PP GPU communication was called instead of the correct "
+ "implementation.");
+}
+
} // namespace gmx
#endif // !GMX_GPU_CUDA
comm_(comm),
ppRanks_(ppRanks)
{
- GMX_RELEASE_ASSERT(
- GMX_THREAD_MPI,
- "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
}
PmeForceSenderGpu::Impl::~Impl() = default;
/*! \brief sends force buffer address to PP ranks */
void PmeForceSenderGpu::Impl::sendForceBufferAddressToPpRanks(DeviceBuffer<Float3> d_f)
{
+ // Need to send the address to PP ranks only for thread-MPI, where each PP rank
+ // pulls the data directly using cudaMemcpy
+ if (!GMX_THREAD_MPI)
+ {
+ return;
+ }
+#if GMX_MPI
int ind_start = 0;
int ind_end = 0;
for (const auto& receiver : ppRanks_)
ind_end = ind_start + receiver.numAtoms;
// Data will be transferred directly from GPU.
- void* sendBuf = reinterpret_cast<void*>(&d_f[ind_start]);
+ Float3* sendBuf = &d_f[ind_start];
-#if GMX_MPI
- MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_);
+ MPI_Send(&sendBuf, sizeof(Float3*), MPI_BYTE, receiver.rankId, 0, comm_);
+ }
#else
- GMX_UNUSED_VALUE(sendBuf);
+ GMX_UNUSED_VALUE(d_f);
#endif
- }
}
/*! \brief Send PME synchronizer directly using CUDA memory copy */
void PmeForceSenderGpu::Impl::sendFSynchronizerToPpCudaDirect(int ppRank)
{
+ GMX_ASSERT(GMX_THREAD_MPI,
+ "sendFSynchronizerToPpCudaDirect is expected to be called only for Thread-MPI");
+
// Data will be pulled directly from PP task
#if GMX_MPI
// TODO Using MPI_Isend would be more efficient, particularly when
// sending to multiple PP ranks
MPI_Send(&pmeForcesReady_, sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
#else
- GMX_UNUSED_VALUE(pmeSyncPtr);
GMX_UNUSED_VALUE(ppRank);
#endif
}
+/*! \brief Send PME data directly using CUDA-aware MPI */
+void PmeForceSenderGpu::Impl::sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf,
+ int offset,
+ int numBytes,
+ int ppRank,
+ MPI_Request* request)
+{
+ GMX_ASSERT(GMX_LIB_MPI, "sendFToPpCudaMpi is expected to be called only for Lib-MPI");
+
+#if GMX_MPI
+ // if using GPU direct comm with CUDA-aware MPI, make sure the forces are
+ // ready on the device before sending them to the PP ranks
+ pmeForcesReady_->waitForEvent();
+
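+ // Non-blocking send of the forces; the caller completes the request
+ // (MPI_Waitall in gmx_pme_send_force_vir_ener)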
+ MPI_Isend(sendbuf[offset], numBytes, MPI_BYTE, ppRank, 0, comm_, request);
+
+#else
+ GMX_UNUSED_VALUE(sendbuf);
+ GMX_UNUSED_VALUE(offset);
+ GMX_UNUSED_VALUE(numBytes);
+ GMX_UNUSED_VALUE(ppRank);
+ GMX_UNUSED_VALUE(request);
+#endif
+}
+
PmeForceSenderGpu::PmeForceSenderGpu(GpuEventSynchronizer* pmeForcesReady,
MPI_Comm comm,
gmx::ArrayRef<PpRanks> ppRanks) :
impl_->sendFSynchronizerToPpCudaDirect(ppRank);
}
+void PmeForceSenderGpu::sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf,
+ int offset,
+ int numBytes,
+ int ppRank,
+ MPI_Request* request)
+{
+ impl_->sendFToPpCudaMpi(sendbuf, offset, numBytes, ppRank, request);
+}
+
} // namespace gmx
void sendForceBufferAddressToPpRanks(DeviceBuffer<Float3> d_f);
/*! \brief
- * Send force synchronizer to PP rank
+ * Send force synchronizer to PP rank (used with Thread-MPI)
* \param[in] ppRank PP rank to receive data
*/
void sendFSynchronizerToPpCudaDirect(int ppRank);
+ /*! \brief
+ * Send force data to a PP rank (used with Lib-MPI)
+ * \param[in] sendbuf force buffer in GPU memory
+ * \param[in] offset starting element in buffer
+ * \param[in] numBytes number of bytes to transfer
+ * \param[in] ppRank PP rank to receive data
+ * \param[in] request MPI request to track asynchronous MPI call status
+ */
+ void sendFToPpCudaMpi(DeviceBuffer<RVec> sendbuf, int offset, int numBytes, int ppRank, MPI_Request* request);
+
private:
//! Event indicating when PME forces are ready on the GPU in order for PP stream to sync with the PME stream
GpuEventSynchronizer* pmeForcesReady_;
#include "pme_output.h"
#include "pme_pp_communication.h"
-/*! \brief environment variable to enable GPU P2P communication */
-static const bool c_enableGpuPmePpComms =
- GMX_GPU_CUDA && GMX_THREAD_MPI && (getenv("GMX_GPU_PME_PP_COMMS") != nullptr);
-
/*! \brief Master PP-PME communication data structure */
struct gmx_pme_pp
{
{
if (pme_pp->useGpuDirectComm)
{
- pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect(
- sender.rankId);
+ if (GMX_THREAD_MPI)
+ {
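+ // thread-MPI: the PP rank pushes coordinates directly into the PME GPU
+ // buffer, so only an event synchronizer is received here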
+ pme_pp->pmeCoordinateReceiverGpu->receiveCoordinatesSynchronizerFromPpCudaDirect(
+ sender.rankId);
+ }
+ else
+ {
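+ // lib-MPI: post a non-blocking receive for the coordinate data itself
+ // using CUDA-aware MPI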
+ pme_pp->pmeCoordinateReceiverGpu->launchReceiveCoordinatesFromPpCudaMpi(
+ stateGpu->getCoordinates(), nat, sender.numAtoms * sizeof(rvec), sender.rankId);
+ }
}
else
{
if (pme_pp->useGpuDirectComm)
{
- pme_pp->pmeCoordinateReceiverGpu->enqueueWaitReceiveCoordinatesFromPpCudaDirect();
+ pme_pp->pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRanks();
}
status = pmerecvqxX;
}
/*! \brief Send the PME mesh force, virial and energy to the PP-only ranks. */
-static void gmx_pme_send_force_vir_ener(gmx_pme_pp* pme_pp,
+static void gmx_pme_send_force_vir_ener(const gmx_pme_t& pme,
+ gmx_pme_pp* pme_pp,
const PmeOutput& output,
real dvdlambda_q,
real dvdlambda_lj,
ind_end = 0;
for (const auto& receiver : pme_pp->ppRanks)
{
- ind_start = ind_end;
- ind_end = ind_start + receiver.numAtoms;
- void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
+ ind_start = ind_end;
+ ind_end = ind_start + receiver.numAtoms;
if (pme_pp->useGpuDirectComm)
{
GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
"The use of GPU direct communication for PME-PP is enabled, "
"but the PME GPU force reciever object does not exist");
- pme_pp->pmeForceSenderGpu->sendFSynchronizerToPpCudaDirect(receiver.rankId);
+
+ if (GMX_THREAD_MPI)
+ {
+ pme_pp->pmeForceSenderGpu->sendFSynchronizerToPpCudaDirect(receiver.rankId);
+ }
+ else
+ {
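+ // lib-MPI: send the force data itself via CUDA-aware MPI; the request
+ // is completed by MPI_Waitall below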
+ pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme),
+ ind_start,
+ receiver.numAtoms * sizeof(rvec),
+ receiver.rankId,
+ &pme_pp->req[messages]);
+
+ messages++;
+ }
}
else
{
+ void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
// Send using MPI
MPI_Isend(sendbuf,
receiver.numAtoms * sizeof(rvec),
MPI_Waitall(messages, pme_pp->req.data(), pme_pp->stat.data());
#else
GMX_RELEASE_ASSERT(false, "Invalid call to gmx_pme_send_force_vir_ener");
+ GMX_UNUSED_VALUE(pme);
GMX_UNUSED_VALUE(pme_pp);
GMX_UNUSED_VALUE(output);
GMX_UNUSED_VALUE(dvdlambda_q);
gmx_walltime_accounting_t walltime_accounting,
t_inputrec* ir,
PmeRunMode runMode,
+ bool useGpuPmePpCommunication,
const gmx::DeviceStreamManager* deviceStreamManager)
{
int ret;
"Device stream can not be nullptr when using GPU in PME-only rank");
changePinningPolicy(&pme_pp->chargeA, pme_get_pinning_policy());
changePinningPolicy(&pme_pp->x, pme_get_pinning_policy());
- if (c_enableGpuPmePpComms)
+ if (useGpuPmePpCommunication)
{
pme_pp->pmeCoordinateReceiverGpu = std::make_unique<gmx::PmeCoordinateReceiverGpu>(
deviceStreamManager->stream(gmx::DeviceStreamType::Pme),
}
cycles = wallcycle_stop(wcycle, WallCycleCounter::PmeMesh);
- gmx_pme_send_force_vir_ener(pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
+ gmx_pme_send_force_vir_ener(*pme, pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
count++;
} /***** end of quasi-loop, we stop with the break above */
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
gmx_walltime_accounting_t walltime_accounting,
t_inputrec* ir,
PmeRunMode runMode,
+ bool useGpuPmePpCommunication,
const gmx::DeviceStreamManager* deviceStreamManager);
#endif
real* xRealPtr = const_cast<real*>(x[0]);
if (useGpuPmePpComms && (fr != nullptr))
{
- void* sendPtr = sendCoordinatesFromGpu
- ? static_cast<void*>(fr->stateGpu->getCoordinates())
- : static_cast<void*>(xRealPtr);
- fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(
- sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
+ if (sendCoordinatesFromGpu)
+ {
+ fr->pmePpCommGpu->sendCoordinatesToPmeFromGpu(
+ fr->stateGpu->getCoordinates(), n, coordinatesReadyOnDeviceEvent);
+ }
+ else
+ {
+ fr->pmePpCommGpu->sendCoordinatesToPmeFromCpu(
+ reinterpret_cast<gmx::RVec*>(xRealPtr), n, coordinatesReadyOnDeviceEvent);
+ }
}
else
{
if (useGpuPmePpComms)
{
GMX_ASSERT(pmePpCommGpu != nullptr, "Need valid pmePpCommGpu");
- // Receive directly using CUDA memory copy
- pmePpCommGpu->receiveForceFromPmeCudaDirect(recvptr, n, receivePmeForceToGpu);
+ // Receive forces from PME rank
+ pmePpCommGpu->receiveForceFromPme(static_cast<gmx::RVec*>(recvptr), n, receivePmeForceToGpu);
}
else
{
#include <memory>
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/math/vectypes.h"
#include "gromacs/utility/gmxmpi.h"
class DeviceContext;
* \param[in] recvSize Number of elements to receive
* \param[in] recvPmeForceToGpu Whether receive is to GPU, otherwise CPU
*/
- void receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool recvPmeForceToGpu);
+ void receiveForceFromPme(RVec* recvPtr, int recvSize, bool recvPmeForceToGpu);
/*! \brief Push coordinates buffer directly to GPU memory on PME task
* \param[in] sendPtr Buffer with coordinate data
* \param[in] sendSize Number of elements to send
- * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
* \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
*/
- void sendCoordinatesToPmeCudaDirect(void* sendPtr,
- int sendSize,
- bool sendPmeCoordinatesFromGpu,
- GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+ void sendCoordinatesToPmeFromGpu(DeviceBuffer<RVec> sendPtr,
+ int sendSize,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
+ /*! \brief Push coordinates buffer from host memory directly to GPU memory on PME task
+ * \param[in] sendPtr Buffer with coordinate data
+ * \param[in] sendSize Number of elements to send
+ * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
+ */
+ void sendCoordinatesToPmeFromCpu(RVec* sendPtr,
+ int sendSize,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
/*! \brief
* Return pointer to buffer used for staging PME force on GPU
"correct implementation.");
}
-void PmePpCommGpu::receiveForceFromPmeCudaDirect(void* /* recvPtr */,
- int /* recvSize */,
- bool /* receivePmeForceToGpu */)
+void PmePpCommGpu::receiveForceFromPme(RVec* /* recvPtr */, int /* recvSize */, bool /* receivePmeForceToGpu */)
{
GMX_ASSERT(!impl_,
"A CPU stub for PME-PP GPU communication was called instead of the correct "
"implementation.");
}
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* /* sendPtr */,
- int /* sendSize */,
- bool /* sendPmeCoordinatesFromGpu */,
- GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
+void PmePpCommGpu::sendCoordinatesToPmeFromGpu(DeviceBuffer<RVec> /* sendPtr */,
+ int /* sendSize */,
+ GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub for PME-PP GPU communication was called instead of the correct "
+ "implementation.");
+}
+
+void PmePpCommGpu::sendCoordinatesToPmeFromCpu(RVec* /* sendPtr */,
+ int /* sendSize */,
+ GpuEventSynchronizer* /* coordinatesOnDeviceEvent */)
{
GMX_ASSERT(!impl_,
"A CPU stub for PME-PP GPU communication was called instead of the correct "
*/
#include "gmxpre.h"
+#include "gromacs/ewald/pme_pp_communication.h"
#include "pme_pp_comm_gpu_impl.h"
#include "config.h"
#include "gromacs/gpu_utils/device_stream.h"
#include "gromacs/gpu_utils/devicebuffer.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/typecasts.cuh"
#include "gromacs/utility/gmxmpi.h"
namespace gmx
pmeRank_(pmeRank),
d_pmeForces_(nullptr)
{
- GMX_RELEASE_ASSERT(
- GMX_THREAD_MPI,
- "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
}
PmePpCommGpu::Impl::~Impl() = default;
{
// This rank will access PME rank memory directly, so needs to receive the remote PME buffer addresses.
#if GMX_MPI
- MPI_Recv(&remotePmeXBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
- MPI_Recv(&remotePmeFBuffer_, sizeof(void**), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+
+ if (GMX_THREAD_MPI)
+ {
+ // receive device buffer address from PME rank
+ MPI_Recv(&remotePmeXBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+ MPI_Recv(&remotePmeFBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+ }
+
+#endif
// Reallocate buffer used for staging PME force on GPU
reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
-#else
- GMX_UNUSED_VALUE(size);
-#endif
return;
}
-// TODO make this asynchronous by splitting into this into
-// launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
-void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu)
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* pmeForcePtr, int recvSize, bool receivePmeForceToGpu)
{
#if GMX_MPI
// Receive event from PME task and add to stream, to ensure pull of data doesn't
GpuEventSynchronizer* pmeSync;
MPI_Recv(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
pmeSync->enqueueWaitEvent(pmePpCommStream_);
+#endif
// Pull force data from remote GPU
- void* pmeForcePtr = receivePmeForceToGpu ? static_cast<void*>(d_pmeForces_) : recvPtr;
- cudaError_t stat = cudaMemcpyAsync(pmeForcePtr,
+ cudaError_t stat = cudaMemcpyAsync(pmeForcePtr,
remotePmeFBuffer_,
recvSize * DIM * sizeof(float),
cudaMemcpyDefault,
{
// Ensure CPU waits for PME forces to be copied before reducing
// them with other forces on the CPU
- cudaStreamSynchronize(pmePpCommStream_.stream());
+ pmePpCommStream_.synchronize();
}
+}
+
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaMpi(float3* pmeForcePtr, int recvSize)
+{
+#if GMX_MPI
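+ // Blocking receive of the PME forces; pmeForcePtr may point to GPU or CPU
+ // memory, which CUDA-aware MPI handles transparently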
+ MPI_Recv(pmeForcePtr, recvSize * DIM, MPI_FLOAT, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
#else
- GMX_UNUSED_VALUE(recvPtr);
+ GMX_UNUSED_VALUE(pmeForcePtr);
GMX_UNUSED_VALUE(recvSize);
- GMX_UNUSED_VALUE(receivePmeForceToGpu);
#endif
}
-void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void* sendPtr,
- int sendSize,
- bool gmx_unused sendPmeCoordinatesFromGpu,
+void PmePpCommGpu::Impl::receiveForceFromPme(float3* recvPtr, int recvSize, bool receivePmeForceToGpu)
+{
+ float3* pmeForcePtr = receivePmeForceToGpu ? asFloat3(d_pmeForces_) : recvPtr;
+ if (GMX_THREAD_MPI)
+ {
+ receiveForceFromPmeCudaDirect(pmeForcePtr, recvSize, receivePmeForceToGpu);
+ }
+ else
+ {
+ receiveForceFromPmeCudaMpi(pmeForcePtr, recvSize);
+ }
+}
+
+void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(float3* sendPtr,
+ int sendSize,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
-#if GMX_MPI
// ensure stream waits until coordinate data is available on device
coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
pmePpCommStream_.stream());
CU_RET_ERR(stat, "cudaMemcpyAsync on Send to PME CUDA direct data transfer failed");
+#if GMX_MPI
// Record and send event to allow PME task to sync to above transfer before commencing force calculations
pmeCoordinatesSynchronizer_.markEvent(pmePpCommStream_);
GpuEventSynchronizer* pmeSync = &pmeCoordinatesSynchronizer_;
MPI_Send(&pmeSync, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_);
+#endif
+}
+
+void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaMpi(float3* sendPtr,
+ int sendSize,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+ // ensure coordinate data is available on device before we start transfer
+ coordinatesReadyOnDeviceEvent->waitForEvent();
+
+#if GMX_MPI
+ MPI_Send(sendPtr, sendSize * DIM, MPI_FLOAT, pmeRank_, eCommType_COORD_GPU, comm_);
#else
GMX_UNUSED_VALUE(sendPtr);
GMX_UNUSED_VALUE(sendSize);
- GMX_UNUSED_VALUE(sendPmeCoordinatesFromGpu);
- GMX_UNUSED_VALUE(coordinatesReadyOnDeviceEvent);
#endif
}
+void PmePpCommGpu::Impl::sendCoordinatesToPme(float3* sendPtr,
+ int sendSize,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+{
+ if (GMX_THREAD_MPI)
+ {
+ sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, coordinatesReadyOnDeviceEvent);
+ }
+ else
+ {
+ sendCoordinatesToPmeCudaMpi(sendPtr, sendSize, coordinatesReadyOnDeviceEvent);
+ }
+}
DeviceBuffer<Float3> PmePpCommGpu::Impl::getGpuForceStagingPtr()
{
return d_pmeForces_;
GpuEventSynchronizer* PmePpCommGpu::Impl::getForcesReadySynchronizer()
{
- return &forcesReadySynchronizer_;
+ if (GMX_THREAD_MPI)
+ {
+ return &forcesReadySynchronizer_;
+ }
+ else
+ {
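+ // with lib-MPI, receiveForceFromPmeCudaMpi() blocks until the forces
+ // have arrived, so no synchronization event is needed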
+ return nullptr;
+ }
}
PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
impl_->reinit(size);
}
-void PmePpCommGpu::receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu)
+void PmePpCommGpu::receiveForceFromPme(RVec* recvPtr, int recvSize, bool receivePmeForceToGpu)
+{
+ impl_->receiveForceFromPme(asFloat3(recvPtr), recvSize, receivePmeForceToGpu);
+}
+
+void PmePpCommGpu::sendCoordinatesToPmeFromGpu(DeviceBuffer<RVec> sendPtr,
+ int sendSize,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
- impl_->receiveForceFromPmeCudaDirect(recvPtr, recvSize, receivePmeForceToGpu);
+ impl_->sendCoordinatesToPme(asFloat3(sendPtr), sendSize, coordinatesReadyOnDeviceEvent);
}
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void* sendPtr,
- int sendSize,
- bool sendPmeCoordinatesFromGpu,
- GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+void PmePpCommGpu::sendCoordinatesToPmeFromCpu(RVec* sendPtr,
+ int sendSize,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
- impl_->sendCoordinatesToPmeCudaDirect(
- sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
+ impl_->sendCoordinatesToPme(asFloat3(sendPtr), sendSize, coordinatesReadyOnDeviceEvent);
}
-DeviceBuffer<gmx::RVec> PmePpCommGpu::getGpuForceStagingPtr()
+DeviceBuffer<Float3> PmePpCommGpu::getGpuForceStagingPtr()
{
return impl_->getGpuForceStagingPtr();
}
#define GMX_PME_PP_COMM_GPU_IMPL_H
#include "gromacs/ewald/pme_pp_comm_gpu.h"
-#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-#include "gromacs/gpu_utils/gputraits.h"
#include "gromacs/math/vectypes.h"
#include "gromacs/utility/gmxmpi.h"
/*! \brief Pull force buffer directly from GPU memory on PME
* rank to either GPU or CPU memory on PP task using CUDA
- * Memory copy.
+ * Memory copy or CUDA-aware MPI.
*
* recvPtr should be in GPU or CPU memory if recvPmeForceToGpu
* is true or false, respectively. If receiving to GPU, this
* \param[in] recvSize Number of elements to receive
* \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
*/
- void receiveForceFromPmeCudaDirect(void* recvPtr, int recvSize, bool receivePmeForceToGpu);
+ void receiveForceFromPme(float3* recvPtr, int recvSize, bool receivePmeForceToGpu);
/*! \brief Push coordinates buffer directly to GPU memory on PME
* task, from either GPU or CPU memory on PP task using CUDA
- * Memory copy. sendPtr should be in GPU or CPU memory if
- * sendPmeCoordinatesFromGpu is true or false respectively. If
- * sending from GPU, this method should be called after the
- * local GPU coordinate buffer operations. The remote PME task will
- * automatically wait for data to be copied before commencing PME force calculations.
+ * Memory copy or CUDA-aware MPI. If sending from GPU, this method should
+ * be called after the local GPU coordinate buffer operations.
+ * The remote PME task will automatically wait for data to be copied
+ * before commencing PME force calculations.
* \param[in] sendPtr Buffer with coordinate data
* \param[in] sendSize Number of elements to send
- * \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
* \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
*/
- void sendCoordinatesToPmeCudaDirect(void* sendPtr,
- int sendSize,
- bool sendPmeCoordinatesFromGpu,
- GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+ void sendCoordinatesToPme(float3* sendPtr, int sendSize, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
/*! \brief
* Return pointer to buffer used for staging PME force on GPU
*/
GpuEventSynchronizer* getForcesReadySynchronizer();
+private:
+ /*! \brief Pull force buffer directly from GPU memory on PME
+ * rank to either GPU or CPU memory on PP task using CUDA
+ * Memory copy. This method is used with Thread-MPI.
+ * \param[out] recvPtr buffer to receive PME force data (GPU or CPU memory)
+ * \param[in] recvSize Number of elements to receive
+ * \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
+ */
+ void receiveForceFromPmeCudaDirect(float3* recvPtr, int recvSize, bool receivePmeForceToGpu);
+
+ /*! \brief Pull force buffer directly from GPU memory on PME
+ * rank to either GPU or CPU memory on PP task using CUDA-aware
+ * MPI. This method is used with process-MPI.
+ * \param[out] recvPtr buffer to receive PME force data (GPU or CPU memory)
+ * \param[in] recvSize Number of elements to receive
+ */
+ void receiveForceFromPmeCudaMpi(float3* recvPtr, int recvSize);
+
+ /*! \brief Push coordinates buffer directly to GPU memory on PME
+ * task, from either GPU or CPU memory on PP task using CUDA Memory copy.
+ * This method is used with Thread-MPI.
+ * \param[in] sendPtr Buffer with coordinate data
+ * \param[in] sendSize Number of elements to send
+ * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
+ */
+ void sendCoordinatesToPmeCudaDirect(float3* sendPtr,
+ int sendSize,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
+ /*! \brief Push coordinates buffer directly to GPU memory on PME
+ * task, from either GPU or CPU memory on PP task using CUDA-aware MPI.
+ * This method is used with process-MPI.
+ * \param[in] sendPtr Buffer with coordinate data
+ * \param[in] sendSize Number of elements to send
+ * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
+ */
+ void sendCoordinatesToPmeCudaMpi(float3* sendPtr,
+ int sendSize,
+ GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+
private:
//! GPU context handle (not used in CUDA)
const DeviceContext& deviceContext_;
//! Handle for CUDA stream used for the communication operations in this class
const DeviceStream& pmePpCommStream_;
//! Remote location of PME coordinate data buffer
- void* remotePmeXBuffer_ = nullptr;
+ float3* remotePmeXBuffer_ = nullptr;
//! Remote location of PME force data buffer
- void* remotePmeFBuffer_ = nullptr;
+ float3* remotePmeFBuffer_ = nullptr;
//! communicator for simulation
MPI_Comm comm_;
//! Rank of PME task
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
* Copyright (c) 2001-2004, The GROMACS development team.
* Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
eCommType_SigmaB,
eCommType_NR,
eCommType_COORD,
+ eCommType_COORD_GPU, //!< Coordinates sent between GPU memory spaces using CUDA-aware MPI
eCommType_CNB
};
GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
- devFlags.enableGpuPmePPComm =
- GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
+ devFlags.enableGpuPmePPComm = GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
#pragma GCC diagnostic pop
// Direct GPU comm path is being used with CUDA_AWARE_MPI
// make sure underlying MPI implementation is CUDA-aware
- if (!GMX_THREAD_MPI && devFlags.enableGpuHaloExchange)
+ if (!GMX_THREAD_MPI && (devFlags.enableGpuPmePPComm || devFlags.enableGpuHaloExchange))
{
const bool haveDetectedCudaAwareMpi =
(checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
devFlags.usingCudaAwareMpi = true;
GMX_LOG(mdlog.warning)
.asParagraph()
- .appendTextFormatted("Using CUDA-aware MPI for 'GPU halo exchange' feature.");
+ .appendTextFormatted(
+ "Using CUDA-aware MPI for 'GPU halo exchange' or 'GPU PME-PP "
+ "communications' feature.");
}
else
{
"detect CUDA_aware support in underlying MPI implementation.");
devFlags.enableGpuHaloExchange = false;
}
+ if (devFlags.enableGpuPmePPComm)
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText(
+ "GMX_GPU_PME_PP_COMMS environment variable detected, but the "
+ "'GPU PME-PP communications' feature will not be enabled as "
+ "GROMACS couldn't "
+ "detect CUDA_aware support in underlying MPI implementation.");
+ devFlags.enableGpuPmePPComm = false;
+ }
GMX_LOG(mdlog.warning)
.asParagraph()
walltime_accounting,
inputrec.get(),
pmeRunMode,
+ runScheduleWork.simulationWork.useGpuPmePpCommunication,
deviceStreamManager.get());
}