/*! \brief
* Send force to PP rank (used with Thread-MPI)
- * \param[in] ppRank PP rank to receive data
- * \param[in] numAtoms number of atoms to send
+ * \param[in] ppRank PP rank to receive data
+ * \param[in] numAtoms number of atoms to send
+ * \param[in] sendForcesDirectToPpGpu whether forces are transferred directly to remote GPU memory
*/
- void sendFToPpCudaDirect(int ppRank, int numAtoms);
+ void sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu);
/*! \brief
* Send force to PP rank (used with Lib-MPI)
"correct implementation.");
}
-void PmeForceSenderGpu::sendFToPpCudaDirect(int /* ppRank */, int /* numAtoms */)
+void PmeForceSenderGpu::sendFToPpCudaDirect(int /* ppRank */,
+ int /* numAtoms */,
+ bool /* sendForcesDirectToPpGpu */)
{
GMX_ASSERT(!impl_,
"A CPU stub for PME-PP GPU communication was called instead of the correct "
MPI_Comm comm,
const DeviceContext& deviceContext,
gmx::ArrayRef<PpRanks> ppRanks) :
- pmeForcesReady_(pmeForcesReady), comm_(comm), ppRanks_(ppRanks), deviceContext_(deviceContext)
+ pmeForcesReady_(pmeForcesReady),
+ comm_(comm),
+ ppRanks_(ppRanks),
+ deviceContext_(deviceContext),
+ ppCommStream_(ppRanks.size()),
+ ppCommEvent_(ppRanks.size()),
+ pmeRemoteGpuForcePtr_(ppRanks.size()),
+ pmeRemoteCpuForcePtr_(ppRanks.size())
{
// Create streams and events to manage pushing of force buffers to remote PP ranks
std::unique_ptr<DeviceStream> stream;
std::unique_ptr<GpuEventSynchronizer> event;
size_t i = 0;
- ppCommStream_.resize(ppRanks_.size());
- ppCommEvent_.resize(ppRanks_.size());
for (i = 0; i < ppRanks_.size(); i++)
{
stream = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false);
ind_start = ind_end;
ind_end = ind_start + receiver.numAtoms;
- localForcePtr_[i++] = &d_f[ind_start];
+ localForcePtr_[i] = &d_f[ind_start];
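+ // Receive the GPU and CPU force buffer addresses published by this PP rank in PmePpCommGpu::Impl::reinit()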
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Recv(&pmeRemoteGpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Recv(&pmeRemoteCpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+ i++;
}
+
#else
GMX_UNUSED_VALUE(d_f);
#endif
/*! \brief Send PME force data directly to the PP rank using CUDA memory copy */
-void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms)
+void PmeForceSenderGpu::Impl::sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu)
{
GMX_ASSERT(GMX_THREAD_MPI, "sendFToPpCudaDirect is expected to be called only for Thread-MPI");
#if GMX_MPI
- void* pmeRemoteForcePtr;
- // NOLINTNEXTLINE(bugprone-sizeof-expression)
- MPI_Recv(&pmeRemoteForcePtr, sizeof(void*), MPI_BYTE, ppRank, 0, comm_, MPI_STATUS_IGNORE);
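+ // Choose the remote destination recorded at reinit: the PP rank's GPU staging buffer or its CPU force buffer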
+ float3* pmeRemoteForcePtr =
+ sendForcesDirectToPpGpu ? pmeRemoteGpuForcePtr_[ppRank] : pmeRemoteCpuForcePtr_[ppRank];
pmeForcesReady_->enqueueWaitEvent(*ppCommStream_[ppRank]);
impl_->sendFToPpCudaMpi(sendbuf, offset, numBytes, ppRank, request);
}
-void PmeForceSenderGpu::sendFToPpCudaDirect(int ppRank, int numAtoms)
+void PmeForceSenderGpu::sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu)
{
- impl_->sendFToPpCudaDirect(ppRank, numAtoms);
+ impl_->sendFToPpCudaDirect(ppRank, numAtoms, sendForcesDirectToPpGpu);
}
/*! \brief
* Send force to PP rank (used with Thread-MPI)
- * \param[in] ppRank PP rank to receive data
- * \param[in] numAtoms number of atoms to send
+ * \param[in] ppRank PP rank to receive data
+ * \param[in] numAtoms number of atoms to send
+ * \param[in] sendForcesDirectToPpGpu whether forces are transferred directly to remote GPU memory
*/
- void sendFToPpCudaDirect(int ppRank, int numAtoms);
+ void sendFToPpCudaDirect(int ppRank, int numAtoms, bool sendForcesDirectToPpGpu);
/*! \brief
* Send force to PP rank (used with Lib-MPI)
std::vector<DeviceBuffer<RVec>> localForcePtr_;
//! GPU context handle (not used in CUDA)
const DeviceContext& deviceContext_;
+ //! Vector of CPU force buffer pointers for multiple remote PP tasks
+ std::vector<float3*> pmeRemoteCpuForcePtr_;
+ //! Vector of GPU force buffer pointers for multiple remote PP tasks
+ std::vector<float3*> pmeRemoteGpuForcePtr_;
};
} // namespace gmx
/*! \brief whether GPU direct communications are active for PME-PP transfers */
bool useGpuDirectComm = false;
+ /*! \brief whether GPU direct communications should send forces directly to remote GPU memory */
+ bool sendForcesDirectToPpGpu = false;
};
/*! \brief Initialize the PME-only side of the PME <-> PP communication */
GMX_ASSERT(!pme_pp->useGpuDirectComm || (pme_pp->pmeForceSenderGpu != nullptr),
"The use of GPU direct communication for PME-PP is enabled, "
"but the PME GPU force reciever object does not exist");
+ pme_pp->sendForcesDirectToPpGpu = ((cnb.flags & PP_PME_RECVFTOGPU) != 0);
if (cnb.flags & PP_PME_FINISH)
{
if (GMX_THREAD_MPI)
{
- pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(receiver.rankId, receiver.numAtoms);
+ pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+ receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
}
else
{
bool useGpuPmePpComms,
bool reinitGpuPmePpComms,
bool sendCoordinatesFromGpu,
+ bool receiveForcesToGpu,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
gmx_domdec_t* dd;
if (useGpuPmePpComms)
{
flags |= PP_PME_GPUCOMMS;
+ if (receiveForcesToGpu)
+ {
+ flags |= PP_PME_RECVFTOGPU;
+ }
}
if (c_useDelayedWait)
{
if (reinitGpuPmePpComms)
{
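+ // Size the CPU receive buffer before reinit(), which publishes its address to the PME rank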
+ std::vector<gmx::RVec>& buffer = cr->dd->pmeForceReceiveBuffer;
+ buffer.resize(n);
fr->pmePpCommGpu->reinit(n);
}
false,
false,
false,
+ false,
nullptr);
}
bool useGpuPmePpComms,
bool receiveCoordinateAddressFromPme,
bool sendCoordinatesFromGpu,
+ bool receiveForcesToGpu,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
gmx_wallcycle* wcycle)
{
useGpuPmePpComms,
receiveCoordinateAddressFromPme,
sendCoordinatesFromGpu,
+ receiveForcesToGpu,
coordinatesReadyOnDeviceEvent);
wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX);
unsigned int flags = PP_PME_FINISH;
gmx_pme_send_coeffs_coords(
- nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, nullptr);
+ nullptr, cr, flags, {}, {}, {}, {}, {}, {}, nullptr, gmx::ArrayRef<gmx::RVec>(), 0, 0, 0, 0, -1, false, false, false, false, nullptr);
}
void gmx_pme_send_switchgrid(const t_commrec* cr, ivec grid_size, real ewaldcoeff_q, real ewaldcoeff_lj)
bool useGpuPmePpComms,
bool reinitGpuPmePpComms,
bool sendCoordinatesFromGpu,
+ bool receiveForcesToGpu,
GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
gmx_wallcycle* wcycle);
#define GMX_PME_PP_COMM_GPU_H
#include <memory>
+#include <vector>
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/math/vectypes.h"
public:
/*! \brief Creates PME-PP GPU communication object
- * \param[in] comm Communicator used for simulation
- * \param[in] pmeRank Rank of PME task
- * \param[in] deviceContext GPU context.
- * \param[in] deviceStream GPU stream.
+ * \param[in] comm Communicator used for simulation
+ * \param[in] pmeRank Rank of PME task
+ * \param[in] pmeCpuForceBuffer Buffer for PME force in CPU memory
+ * \param[in] deviceContext GPU context.
+ * \param[in] deviceStream GPU stream.
*/
- PmePpCommGpu(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ PmePpCommGpu(MPI_Comm comm,
+ int pmeRank,
+ std::vector<gmx::RVec>& pmeCpuForceBuffer,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream);
~PmePpCommGpu();
/*! \brief Perform steps required when buffer size changes
/*!\brief Constructor stub. */
PmePpCommGpu::PmePpCommGpu(MPI_Comm /* comm */,
int /* pmeRank */,
+ std::vector<gmx::RVec>& /* pmeCpuForceBuffer */,
const DeviceContext& /* deviceContext */,
const DeviceStream& /* deviceStream */) :
impl_(nullptr)
namespace gmx
{
-PmePpCommGpu::Impl::Impl(MPI_Comm comm,
- int pmeRank,
- const DeviceContext& deviceContext,
- const DeviceStream& deviceStream) :
+PmePpCommGpu::Impl::Impl(MPI_Comm comm,
+ int pmeRank,
+ std::vector<gmx::RVec>& pmeCpuForceBuffer,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream) :
deviceContext_(deviceContext),
pmePpCommStream_(deviceStream),
comm_(comm),
pmeRank_(pmeRank),
+ pmeCpuForceBuffer_(pmeCpuForceBuffer),
d_pmeForces_(nullptr)
{
}
void PmePpCommGpu::Impl::reinit(int size)
{
+ // Reallocate device buffer used for staging PME force
+ reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
+
// This rank will access PME rank memory directly, so needs to receive the remote PME buffer addresses.
#if GMX_MPI
if (GMX_THREAD_MPI)
{
- // receive device buffer address from PME rank
+ // receive device coordinate buffer address from PME rank
MPI_Recv(&remotePmeXBuffer_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+ // send host and device force buffer addresses to PME rank
+ MPI_Send(&d_pmeForces_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_);
+ RVec* pmeCpuForceBufferData = pmeCpuForceBuffer_.data();
+ MPI_Send(&pmeCpuForceBufferData, sizeof(RVec*), MPI_BYTE, pmeRank_, 0, comm_);
}
#endif
-
- // Reallocate buffer used for staging PME force on GPU
- reallocateDeviceBuffer(&d_pmeForces_, size, &d_pmeForcesSize_, &d_pmeForcesSizeAlloc_, deviceContext_);
}
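The raw host and device pointers exchanged in the two reinit() hunks above are only meaningful because, with thread-MPI, all ranks are threads of a single process and therefore share one address space: a device pointer received from another rank can be passed straight to cudaMemcpyAsync. A minimal, self-contained sketch of that pattern (hypothetical names such as pushToPeerBuffer and d_peerBuf, not GROMACS API), assuming two thread-MPI ranks using the same GPU:

#include <mpi.h>
#include <cuda_runtime.h>

// Hypothetical illustration: publish our device buffer address to a peer rank and
// push data directly into the peer's buffer. Valid only under thread-MPI, where
// both ranks dereference pointers within the same process/address space.
void pushToPeerBuffer(MPI_Comm comm, int myRank, float* d_localBuf, size_t numFloats, cudaStream_t stream)
{
    const int peer     = 1 - myRank; // assumes exactly two ranks, 0 and 1
    float*    d_peerBuf = nullptr;
    // One-time address handshake: send our pointer, receive the peer's.
    MPI_Sendrecv(&d_localBuf, sizeof(float*), MPI_BYTE, peer, 0,
                 &d_peerBuf, sizeof(float*), MPI_BYTE, peer, 0,
                 comm, MPI_STATUS_IGNORE);
    // Device-to-device copy straight into the peer's buffer (synchronization of
    // the consumer is omitted in this sketch).
    cudaMemcpyAsync(d_peerBuf, d_localBuf, numFloats * sizeof(float),
                    cudaMemcpyDeviceToDevice, stream);
}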
// TODO make this asynchronous by splitting this into
// launchRecvForceFromPmeCudaDirect() and syncRecvForceFromPmeCudaDirect()
-void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(float3* recvPtr, bool receivePmeForceToGpu)
+void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu)
{
#if GMX_MPI
// Remote PME task pushes GPU data directly to this PP task.
- void* localForcePtr = receivePmeForceToGpu ? static_cast<void*>(d_pmeForces_) : recvPtr;
-
- // Send destination pointer to PME task. Do this every step since
- // PME task is agostic as to whether destination is PP CPU or
- // GPU.
- // NOLINTNEXTLINE(bugprone-sizeof-expression)
- MPI_Send(&localForcePtr, sizeof(void*), MPI_BYTE, pmeRank_, 0, comm_);
-
// Receive event from PME task after PME->PP force data push has
// been scheduled and enqueue this to PP stream.
GpuEventSynchronizer* eventptr;
float3* pmeForcePtr = receivePmeForceToGpu ? asFloat3(d_pmeForces_) : recvPtr;
if (GMX_THREAD_MPI)
{
- receiveForceFromPmeCudaDirect(pmeForcePtr, receivePmeForceToGpu);
+ receiveForceFromPmeCudaDirect(receivePmeForceToGpu);
}
else
{
}
}
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
- int pmeRank,
- const DeviceContext& deviceContext,
- const DeviceStream& deviceStream) :
- impl_(new Impl(comm, pmeRank, deviceContext, deviceStream))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm comm,
+ int pmeRank,
+ std::vector<gmx::RVec>& pmeCpuForceBuffer,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream) :
+ impl_(new Impl(comm, pmeRank, pmeCpuForceBuffer, deviceContext, deviceStream))
{
}
public:
/*! \brief Creates PME-PP GPU communication object.
*
- * \param[in] comm Communicator used for simulation
- * \param[in] pmeRank Rank of PME task
- * \param[in] deviceContext GPU context.
- * \param[in] deviceStream GPU stream.
+ * \param[in] comm Communicator used for simulation
+ * \param[in] pmeRank Rank of PME task
+ * \param[in] pmeCpuForceBuffer Buffer for PME force in CPU memory
+ * \param[in] deviceContext GPU context.
+ * \param[in] deviceStream GPU stream.
*/
- Impl(MPI_Comm comm, int pmeRank, const DeviceContext& deviceContext, const DeviceStream& deviceStream);
+ Impl(MPI_Comm comm,
+ int pmeRank,
+ std::vector<gmx::RVec>& pmeCpuForceBuffer,
+ const DeviceContext& deviceContext,
+ const DeviceStream& deviceStream);
~Impl();
/*! \brief Perform steps required when buffer size changes
/*! \brief Pull force buffer directly from GPU memory on PME
* rank to either GPU or CPU memory on PP task using CUDA
* Memory copy. This method is used with Thread-MPI.
- * \param[out] recvPtr CPU buffer to receive PME force data
* \param[in] receivePmeForceToGpu Whether receive is to GPU, otherwise CPU
*/
- void receiveForceFromPmeCudaDirect(float3* recvPtr, bool receivePmeForceToGpu);
+ void receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu);
/*! \brief Pull force buffer directly from GPU memory on PME
* rank to either GPU or CPU memory on PP task using CUDA-aware
MPI_Comm comm_;
//! Rank of PME task
int pmeRank_ = -1;
+ //! Buffer for PME force on CPU
+ std::vector<gmx::RVec>& pmeCpuForceBuffer_;
//! Buffer for staging PME force on GPU
DeviceBuffer<gmx::RVec> d_pmeForces_;
//! number of atoms in PME force staging array
#define PP_PME_SWITCHGRID (1 << 11)
#define PP_PME_RESETCOUNTERS (1 << 12)
#define PP_PME_GPUCOMMS (1 << 13)
+// Whether PME forces are transferred directly to remote PP GPU memory in a specific step
+#define PP_PME_RECVFTOGPU (1 << 14)
//@}
/*! \brief Return values for gmx_pme_recv_q_x */
simulationWork.useGpuPmePpCommunication,
reinitGpuPmePpComms,
pmeSendCoordinatesFromGpu,
+ stepWork.useGpuPmeFReduction,
localXReadyOnDevice,
wcycle);
}
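+ // The CPU force receive buffer is passed so that PmePpCommGpu::reinit() can share its address with the PME rank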
fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
cr->mpi_comm_mysim,
cr->dd->pme_nodeid,
+ cr->dd->pmeForceReceiveBuffer,
deviceStreamManager->context(),
deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
}