#include "gromacs/utility/gmxmpi.h"
struct gmx_domdec_t;
+class GpuEventSynchronizer;
namespace gmx
{
* the non-local non-bonded kernels). It also must be called
* after the local coordinates buffer operations (where the
* coordinates are copied to the device and hence the \c
- * coordinatesOnDeviceEvent is recorded). Force Halo exchange
+ * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange
* will be performed in \c streamNonLocal (also potentially
* with buffer clearing in \c streamLocal) and the \c
* communicateHaloForces method must be called after the
* \param [in] mpi_comm_mysim communicator used for simulation
* \param [in] streamLocal local NB CUDA stream.
* \param [in] streamNonLocal non-local NB CUDA stream.
- * \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
*/
GpuHaloExchange(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
void *streamLocal,
- void *streamNonLocal,
- void *coordinatesOnDeviceEvent);
+ void *streamNonLocal);
~GpuHaloExchange();
/*! \brief
* event when the coordinate data has been copied to the
* device).
* \param [in] box Coordinate box (from which shifts will be constructed)
+ * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
*/
- void communicateHaloCoordinates(const matrix box);
+ void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer *coordinatesReadyOnDeviceEvent);
/*! \brief GPU halo exchange of force buffer.
* \param[in] accumulateForces True if forces should accumulate, otherwise they are set
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t * /* dd */,
MPI_Comm /* mpi_comm_mysim */,
void * /*streamLocal */,
-void * /*streamNonLocal */,
-void * /*coordinatesOnDeviceEvent*/)
+void * /*streamNonLocal */)
: impl_(nullptr)
{
// Stub: non-CUDA builds must never construct a GpuHaloExchange.
// Fixed typo in the assertion message: "insted" -> "instead".
GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
}
/*!\brief apply X halo exchange stub. */
-void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */)
+void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
+                                                 GpuEventSynchronizer * /*coordinatesReadyOnDeviceEvent*/)
{
// Stub: non-CUDA builds must never reach the coordinate halo exchange.
// Fixed assertion-message typos ("insted" -> "instead", duplicated word
// "Exchange exchange") and renamed the commented-out parameter to
// coordinatesReadyOnDeviceEvent to match the header declaration.
GMX_ASSERT(false, "A CPU stub for GPU Halo Exchange was called instead of the correct implementation.");
}
return;
}
-// The following method be called after local setCoordinates (which records the coordinatesOnDeviceEvent_
-// event when the coordinate data has been copied to the device).
-void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box)
+void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
{
//ensure stream waits until coordinate data is available on device
- coordinatesOnDeviceEvent_->enqueueWaitEvent(nonLocalStream_);
+ coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
// launch kernel to pack send buffer
KernelLaunchConfig config;
GpuHaloExchange::Impl::Impl(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
void * localStream,
- void * nonLocalStream,
- void * coordinatesOnDeviceEvent)
+ void * nonLocalStream)
: dd_(dd),
sendRankX_(dd->neighbor[0][1]),
recvRankX_(dd->neighbor[0][0]),
haloDataTransferLaunched_(new GpuEventSynchronizer()),
mpi_comm_mysim_(mpi_comm_mysim),
localStream_(*static_cast<cudaStream_t*> (localStream)),
- nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream)),
- coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+ nonLocalStream_(*static_cast<cudaStream_t*> (nonLocalStream))
{
GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "GPU Halo exchange is currently only supported with thread-MPI enabled");
GpuHaloExchange::GpuHaloExchange(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
void *localStream,
- void *nonLocalStream,
- void *coordinatesOnDeviceEvent)
- : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream, coordinatesOnDeviceEvent))
+ void *nonLocalStream)
+ : impl_(new Impl(dd, mpi_comm_mysim, localStream, nonLocalStream))
{
}
impl_->reinitHalo(reinterpret_cast<float3*>(d_coordinatesBuffer), reinterpret_cast<float3*>(d_forcesBuffer));
}
-void GpuHaloExchange::communicateHaloCoordinates(const matrix box)
+void GpuHaloExchange::communicateHaloCoordinates(const matrix box, GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
{
- impl_->communicateHaloCoordinates(box);
+ impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
}
void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
* \param [in] mpi_comm_mysim communicator used for simulation
* \param [in] localStream local NB CUDA stream
* \param [in] nonLocalStream non-local NB CUDA stream
- * \param [in] coordinatesOnDeviceEvent event recorded when coordinates have been copied to device
*/
Impl(gmx_domdec_t *dd,
MPI_Comm mpi_comm_mysim,
void *localStream,
- void *nonLocalStream,
- void *coordinatesOnDeviceEvent);
+ void *nonLocalStream);
~Impl();
/*! \brief
/*! \brief
* GPU halo exchange of coordinates buffer
* \param [in] box Coordinate box (from which shifts will be constructed)
+ * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
*/
- void communicateHaloCoordinates(const matrix box);
+ void communicateHaloCoordinates(const matrix box,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent);
/*! \brief GPU halo exchange of force buffer
* \param[in] accumulateForces True if forces should accumulate, otherwise they are set
cudaStream_t localStream_ = nullptr;
//! CUDA stream for non-local non-bonded calculations
cudaStream_t nonLocalStream_ = nullptr;
- //! Event triggered when coordinates have been copied to device
- GpuEventSynchronizer *coordinatesOnDeviceEvent_ = nullptr;
//! full coordinates buffer in GPU memory
float3 *d_x_ = nullptr;
//! full forces buffer in GPU memory
gmx_bool bEnerVir,
int64_t step, bool useGpuPmePpComms,
bool reinitGpuPmePpComms,
- bool sendCoordinatesFromGpu, gmx_wallcycle *wcycle);
+ bool sendCoordinatesFromGpu,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent, gmx_wallcycle *wcycle);
/*! \brief Tell our PME-only node to finish */
void gmx_pme_send_finish(const t_commrec *cr);
int maxshift_x, int maxshift_y,
int64_t step, bool useGpuPmePpComms,
bool reinitGpuPmePpComms,
- bool sendCoordinatesFromGpu)
+ bool sendCoordinatesFromGpu,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent)
{
gmx_domdec_t *dd;
gmx_pme_comm_n_box_t *cnb;
{
void *sendPtr = sendCoordinatesFromGpu ? static_cast<void*> (fr->stateGpu->getCoordinates()) :
static_cast<void*> (xRealPtr);
- fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu);
+ fr->pmePpCommGpu->sendCoordinatesToPmeCudaDirect(sendPtr, n, sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
}
else
{
gmx_pme_send_coeffs_coords(nullptr, cr, flags,
chargeA, chargeB,
sqrt_c6A, sqrt_c6B, sigmaA, sigmaB,
- nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false, false, false);
+ nullptr, nullptr, 0, 0, maxshift_x, maxshift_y, -1, false, false, false, nullptr);
}
void gmx_pme_send_coordinates(t_forcerec *fr, const t_commrec *cr, const matrix box, const rvec *x,
gmx_bool bEnerVir,
int64_t step, bool useGpuPmePpComms,
bool receiveCoordinateAddressFromPme,
- bool sendCoordinatesFromGpu, gmx_wallcycle *wcycle)
+ bool sendCoordinatesFromGpu,
+ GpuEventSynchronizer *coordinatesReadyOnDeviceEvent, gmx_wallcycle *wcycle)
{
wallcycle_start(wcycle, ewcPP_PMESENDX);
}
gmx_pme_send_coeffs_coords(fr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
box, x, lambda_q, lambda_lj, 0, 0, step, useGpuPmePpComms, receiveCoordinateAddressFromPme,
- sendCoordinatesFromGpu);
+ sendCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
wallcycle_stop(wcycle, ewcPP_PMESENDX);
}
{
unsigned int flags = PP_PME_FINISH;
- gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false);
+ gmx_pme_send_coeffs_coords(nullptr, cr, flags, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, -1, false, false, false, nullptr);
}
void gmx_pme_send_switchgrid(const t_commrec *cr,
#include "gromacs/utility/classhelpers.h"
#include "gromacs/utility/gmxmpi.h"
+class GpuEventSynchronizer;
+
namespace gmx
{
/*! \brief Creates PME-PP GPU communication object
* \param[in] comm Communicator used for simulation
* \param[in] pmeRank Rank of PME task
- * \param[in] coordinatesOnDeviceEvent Event recorded when coordinates are available on device
*/
- PmePpCommGpu(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent);
+ PmePpCommGpu(MPI_Comm comm, int pmeRank);
~PmePpCommGpu();
/*! \brief Perform steps required when buffer size changes
* \param[in] sendPtr Buffer with coordinate data
* \param[in] sendSize Number of elements to send
* \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
+ * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
*/
- void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu);
+ void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
/*! \brief
* Return pointer to buffer used for staging PME force on GPU
};
/*!\brief Constructor stub. */
-PmePpCommGpu::PmePpCommGpu(MPI_Comm gmx_unused comm, int gmx_unused pmeRank, void gmx_unused *coordinatesOnDeviceEvent)
+PmePpCommGpu::PmePpCommGpu(MPI_Comm gmx_unused comm, int gmx_unused pmeRank)
: impl_(nullptr)
{
GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication was called instead of the correct implementation.");
GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication was called instead of the correct implementation.");
}
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void gmx_unused *sendPtr, int gmx_unused sendSize, bool gmx_unused sendPmeCoordinatesFromGpu)
+void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void gmx_unused *sendPtr, int gmx_unused sendSize, bool gmx_unused sendPmeCoordinatesFromGpu, GpuEventSynchronizer gmx_unused *coordinatesReadyOnDeviceEvent)
{
// Stub: non-CUDA builds must never attempt the PP->PME coordinate send.
// Renamed the unused parameter to coordinatesReadyOnDeviceEvent for
// consistency with the class declaration and the CUDA implementation.
GMX_ASSERT(false, "A CPU stub for PME-PP GPU communication was called instead of the correct implementation.");
}
namespace gmx
{
-PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent)
+PmePpCommGpu::Impl::Impl(MPI_Comm comm, int pmeRank)
: comm_(comm),
- pmeRank_(pmeRank),
- coordinatesOnDeviceEvent_(static_cast<GpuEventSynchronizer*> (coordinatesOnDeviceEvent))
+ pmeRank_(pmeRank)
{
GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
cudaStreamCreate(&pmePpCommStream_);
}
}
-void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool gmx_unused sendPmeCoordinatesFromGpu)
+void PmePpCommGpu::Impl::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool gmx_unused sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
//ensure stream waits until coordinate data is available on device
- coordinatesOnDeviceEvent_->enqueueWaitEvent(pmePpCommStream_);
+ coordinatesReadyOnDeviceEvent->enqueueWaitEvent(pmePpCommStream_);
cudaError_t stat = cudaMemcpyAsync(remotePmeXBuffer_, sendPtr,
sendSize*DIM*sizeof(float), cudaMemcpyDefault,
return static_cast<void*> (&forcesReadySynchronizer_);
}
-PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent)
- : impl_(new Impl(comm, pmeRank, coordinatesOnDeviceEvent))
+PmePpCommGpu::PmePpCommGpu(MPI_Comm comm, int pmeRank)
+ : impl_(new Impl(comm, pmeRank))
{
}
impl_->receiveForceFromPmeCudaDirect(recvPtr, recvSize, receivePmeForceToGpu);
}
-void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu)
+void PmePpCommGpu::sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
{
- impl_->sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, sendPmeCoordinatesFromGpu);
+ impl_->sendCoordinatesToPmeCudaDirect(sendPtr, sendSize, sendPmeCoordinatesFromGpu, coordinatesReadyOnDeviceEvent);
}
void* PmePpCommGpu::getGpuForceStagingPtr()
/*! \brief Creates PME-PP GPU communication object.
* \param[in] comm Communicator used for simulation
* \param[in] pmeRank Rank of PME task
- * \param[in] coordinatesOnDeviceEvent Event recorded when coordinates are available on device
*/
- Impl(MPI_Comm comm, int pmeRank, void* coordinatesOnDeviceEvent);
+ Impl(MPI_Comm comm, int pmeRank);
~Impl();
/*! \brief Perform steps required when buffer size changes
* \param[in] sendPtr Buffer with coordinate data
* \param[in] sendSize Number of elements to send
* \param[in] sendPmeCoordinatesFromGpu Whether send is from GPU, otherwise CPU
+ * \param[in] coordinatesReadyOnDeviceEvent Event recorded when coordinates are available on device
*/
- void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu);
+ void sendCoordinatesToPmeCudaDirect(void *sendPtr, int sendSize, bool sendPmeCoordinatesFromGpu, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
/*! \brief
* Return pointer to buffer used for staging PME force on GPU
GpuEventSynchronizer forcesReadySynchronizer_;
//! Event recorded when coordinates have been transferred to PME task
GpuEventSynchronizer pmeCoordinatesSynchronizer_;
- //! Event recorded when coordinates have been copied to GPU on this PP task.
- GpuEventSynchronizer *coordinatesOnDeviceEvent_;
};
} // namespace gmx
if (pmeOnlyRankUsesGpu && c_enableGpuPmePpComms)
{
- void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(cr->mpi_comm_mysim,
- cr->dd->pme_nodeid,
- coordinatesOnDeviceEvent);
+ cr->dd->pme_nodeid);
}
}
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
}
+ const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(AtomLocality::Local,
+ simulationWork, stepWork) : nullptr;
+
#if GMX_MPI
if (!thisRankHasDuty(cr, DUTY_PME))
{
lambda[efptCOUL], lambda[efptVDW],
(stepWork.computeVirial || stepWork.computeEnergy),
step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
- sendCoordinatesFromGpu, wcycle);
+ sendCoordinatesFromGpu, localXReadyOnDevice, wcycle);
}
#endif /* GMX_MPI */
- const auto localXReadyOnDevice = (stateGpu != nullptr) ? stateGpu->getCoordinatesReadyOnDeviceEvent(AtomLocality::Local,
- simulationWork, stepWork) : nullptr;
if (useGpuPmeOnThisRank)
{
launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags,
{
// The following must be called after local setCoordinates (which records an event
// when the coordinate data has been copied to the device).
- gpuHaloExchange->communicateHaloCoordinates(box);
+ gpuHaloExchange->communicateHaloCoordinates(box, localXReadyOnDevice);
if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
{
GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps, "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
void *streamLocal = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
void *streamNonLocal = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
- void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
"NOTE: This run uses the 'GPU halo exchange' feature, enabled by the GMX_GPU_DD_COMMS environment variable.");
cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(cr->dd, cr->mpi_comm_mysim, streamLocal,
- streamNonLocal, coordinatesOnDeviceEvent);
+ streamNonLocal);
}
/* Initialize the mdAtoms structure.