ppRanks_(ppRanks),
ppCommStream_(ppRanks.size()),
ppCommEvent_(ppRanks.size()),
+ ppCommEventRecorded_(ppRanks.size()),
deviceContext_(deviceContext),
pmeRemoteCpuForcePtr_(ppRanks.size()),
pmeRemoteGpuForcePtr_(ppRanks.size())
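// Receive addresses of the remote PP rank's GPU and CPU force buffers
// NOLINTNEXTLINE(bugprone-sizeof-expression)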
MPI_Recv(&pmeRemoteGpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
// NOLINTNEXTLINE(bugprone-sizeof-expression)
MPI_Recv(&pmeRemoteCpuForcePtr_[i], sizeof(float3*), MPI_BYTE, receiver.rankId, 0, comm_, MPI_STATUS_IGNORE);
+ // Send address of event and associated flag to PP rank, to allow remote enqueueing
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Send(&ppCommEvent_[i], sizeof(GpuEventSynchronizer*), MPI_BYTE, receiver.rankId, 0, comm_);
+
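+ // The per-rank flag lives in its own cache line (CacheLineAlignedFlag) to avoid false
+ // sharing between OpenMP threads; accessing it via std::atomic<bool>* assumes the bool
+ // is the first (and only) member of the aligned wrapper.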
+ std::atomic<bool>* tmpPpCommEventRecordedPtr =
+ reinterpret_cast<std::atomic<bool>*>(&(ppCommEventRecorded_[i]));
+ tmpPpCommEventRecordedPtr->store(false, std::memory_order_release);
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Send(&tmpPpCommEventRecordedPtr, sizeof(std::atomic<bool>*), MPI_BYTE, receiver.rankId, 0, comm_);
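+ // Handshake per step (thread-MPI path): the PME rank records ppCommEvent_ and then
+ // release-stores true to this flag; the PP rank acquire-spins on the flag, enqueues a
+ // wait on the event in its own stream, and resets the flag for the next step.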
i++;
}
ppCommStream_[ppRank]->stream());
CU_RET_ERR(stat, "cudaMemcpyAsync on Recv from PME CUDA direct data transfer failed");
ppCommEvent_[ppRank]->markEvent(*ppCommStream_[ppRank]);
- // NOLINTNEXTLINE(bugprone-sizeof-expression)
- MPI_Send(&ppCommEvent_[ppRank], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_);
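+ // Signal the PP rank that the event has been recorded; this release store pairs with
+ // the acquire load in PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect().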
+ std::atomic<bool>* tmpPpCommEventRecordedPtr =
+ reinterpret_cast<std::atomic<bool>*>(&(ppCommEventRecorded_[ppRank]));
+ tmpPpCommEventRecordedPtr->store(true, std::memory_order_release);
#else
GMX_UNUSED_VALUE(ppRank);
GMX_UNUSED_VALUE(numAtoms);
#ifndef GMX_PMEFORCESENDERGPU_IMPL_H
#define GMX_PMEFORCESENDERGPU_IMPL_H
+#include <atomic>
+#include <new>
+
#include "gromacs/ewald/pme_force_sender_gpu.h"
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/gpu_utils/gputraits.h"
#include "gromacs/utility/arrayref.h"
+// Portable definition of cache line size
+#ifdef __cpp_lib_hardware_interference_size
+using std::hardware_destructive_interference_size;
+#else
+constexpr std::size_t hardware_destructive_interference_size = 64;
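+// 64 bytes is an assumption matching common x86-64 CPUs; platforms with larger cache
+// lines only risk extra false sharing (a performance, not a correctness, concern).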
+#endif
+
class GpuEventSynchronizer;
namespace gmx
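+/*! \brief Bool flag padded out to a full cache line, so that flags for different PP
+ * ranks, written concurrently by different OpenMP threads, do not falsely share a cache line. */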
+struct CacheLineAlignedFlag
+{
+ alignas(hardware_destructive_interference_size) bool flag;
+};
/*! \internal \brief Class with interfaces and data for CUDA version of PME Force sending functionality*/
+
class PmeForceSenderGpu::Impl
{
std::vector<std::unique_ptr<DeviceStream>> ppCommStream_;
//! Events used for managing sync with remote PP ranks
std::vector<std::unique_ptr<GpuEventSynchronizer>> ppCommEvent_;
+ //! Vector of flags to track when PP transfer events have been recorded
+ std::vector<std::atomic<CacheLineAlignedFlag>> ppCommEventRecorded_;
//! Addresses of local force buffers to send to remote PP ranks
std::vector<DeviceBuffer<RVec>> localForcePtr_;
//! GPU context handle (not used in CUDA)
#include "gromacs/utility/smalloc.h"
#include "pme_gpu_internal.h"
+#include "pme_internal.h"
#include "pme_output.h"
#include "pme_pp_communication.h"
int messages, ind_start, ind_end;
cve.cycles = cycles;
- /* Now the evaluated forces have to be transferred to the PP nodes */
+ if (pme_pp->useGpuDirectComm)
+ {
+ GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
+ "The use of GPU direct communication for PME-PP is enabled, "
+ "but the PME GPU force reciever object does not exist");
+ }
+
messages = 0;
ind_end = 0;
- for (const auto& receiver : pme_pp->ppRanks)
+
+ /* Now the evaluated forces have to be transferred to the PP ranks */
+ if (pme_pp->useGpuDirectComm && GMX_THREAD_MPI)
{
- ind_start = ind_end;
- ind_end = ind_start + receiver.numAtoms;
- if (pme_pp->useGpuDirectComm)
+ int numPpRanks = static_cast<int>(pme_pp->ppRanks.size());
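+ // Each OpenMP thread issues the CUDA-direct send (device copy plus event record) for
+ // one PP rank, overlapping the per-rank transfers.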
+# pragma omp parallel for num_threads(std::min(numPpRanks, pme.nthread)) schedule(static)
+ for (int i = 0; i < numPpRanks; i++)
{
- GMX_ASSERT((pme_pp->pmeForceSenderGpu != nullptr),
- "The use of GPU direct communication for PME-PP is enabled, "
- "but the PME GPU force reciever object does not exist");
-
- if (GMX_THREAD_MPI)
- {
- pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
- receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
- }
- else
+ auto& receiver = pme_pp->ppRanks[i];
+ pme_pp->pmeForceSenderGpu->sendFToPpCudaDirect(
+ receiver.rankId, receiver.numAtoms, pme_pp->sendForcesDirectToPpGpu);
+ }
+ }
+ else
+ {
+ for (const auto& receiver : pme_pp->ppRanks)
+ {
+ ind_start = ind_end;
+ ind_end = ind_start + receiver.numAtoms;
+ if (pme_pp->useGpuDirectComm)
{
pme_pp->pmeForceSenderGpu->sendFToPpCudaMpi(pme_gpu_get_device_f(&pme),
ind_start,
receiver.numAtoms * sizeof(rvec),
receiver.rankId,
&pme_pp->req[messages]);
-
- messages++;
}
- }
- else
- {
- void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
- // Send using MPI
- MPI_Isend(sendbuf,
- receiver.numAtoms * sizeof(rvec),
- MPI_BYTE,
- receiver.rankId,
- 0,
- pme_pp->mpi_comm_mysim,
- &pme_pp->req[messages]);
+ else
+ {
+ void* sendbuf = const_cast<void*>(static_cast<const void*>(output.forces_[ind_start]));
+ // Send using MPI
+ MPI_Isend(sendbuf,
+ receiver.numAtoms * sizeof(rvec),
+ MPI_BYTE,
+ receiver.rankId,
+ 0,
+ pme_pp->mpi_comm_mysim,
+ &pme_pp->req[messages]);
+ }
messages++;
}
}
MPI_Send(&d_pmeForces_, sizeof(float3*), MPI_BYTE, pmeRank_, 0, comm_);
RVec* pmeCpuForceBufferData = pmeCpuForceBuffer_->data();
MPI_Send(&pmeCpuForceBufferData, sizeof(RVec*), MPI_BYTE, pmeRank_, 0, comm_);
+ // Receive address of event and associated flag from PME rank, to allow sync to local stream after force transfer
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Recv(&remotePmeForceSendEvent_, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Recv(&remotePmeForceSendEventRecorded_, sizeof(std::atomic<bool>*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
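+ // (Exchanging raw pointers is valid here because thread-MPI ranks share a single
+ // process address space.)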
}
#endif
}
-// TODO make this asynchronous by splitting into this into
-// launchRecvForceFromPmeCudaDirect() and sycnRecvForceFromPmeCudaDirect()
void PmePpCommGpu::Impl::receiveForceFromPmeCudaDirect(bool receivePmeForceToGpu)
{
#if GMX_MPI
- // Remote PME task pushes GPU data directly data to this PP task.
+ // Wait until remote PME task has pushed data, and then enqueue remote event to local stream.
- // Recieve event from PME task after PME->PP force data push has
- // been scheduled and enqueue this to PP stream.
- GpuEventSynchronizer* eventptr;
- // NOLINTNEXTLINE(bugprone-sizeof-expression)
- MPI_Recv(&eventptr, sizeof(GpuEventSynchronizer*), MPI_BYTE, pmeRank_, 0, comm_, MPI_STATUS_IGNORE);
- eventptr->enqueueWaitEvent(pmePpCommStream_);
+ // Spin until PME rank sets flag
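+ // The acquire load pairs with the release store on the PME rank, guaranteeing the
+ // recorded event is visible once the flag reads true.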
+ while (!(remotePmeForceSendEventRecorded_->load(std::memory_order_acquire))) {}
+
+ // Enqueue remote event
+ remotePmeForceSendEvent_->enqueueWaitEvent(pmePpCommStream_);
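+ // (enqueueWaitEvent makes subsequent work in pmePpCommStream_ wait for the remote
+ // event without blocking the host thread)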
+
+ // Reset the flag
+ remotePmeForceSendEventRecorded_->store(false, std::memory_order_release);
if (receivePmeForceToGpu)
{
#ifndef GMX_PME_PP_COMM_GPU_IMPL_H
#define GMX_PME_PP_COMM_GPU_IMPL_H
+#include <atomic>
+
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gpu_utils/gpueventsynchronizer.h"
#include "gromacs/math/vectypes.h"
GpuEventSynchronizer forcesReadySynchronizer_;
//! Event recorded when coordinates have been transferred to PME task
GpuEventSynchronizer pmeCoordinatesSynchronizer_;
+ //! Event recorded by remote PME task when forces have been transferred
+ GpuEventSynchronizer* remotePmeForceSendEvent_;
+ //! Flag to track when the remote PME event has been recorded, ready for enqueueing
+ volatile std::atomic<bool>* remotePmeForceSendEventRecorded_;
};
} // namespace gmx