/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
*/
#include "gmxpre.h"
+#include "gromacs/ewald/pme_pp_communication.h"
#include "pme_coordinate_receiver_gpu_impl.h"
#include "config.h"
-#include <assert.h>
-#include <stdio.h>
-
-#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_force_sender_gpu.h"
#include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#include "gromacs/gpu_utils/gpueventsynchronizer.h"
#include "gromacs/utility/gmxmpi.h"
namespace gmx
{
-PmeCoordinateReceiverGpu::Impl::Impl(void *pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks)
- : pmeStream_(*static_cast<cudaStream_t*> (pmeStream)),
- comm_(comm),
- ppRanks_(ppRanks)
+PmeCoordinateReceiverGpu::Impl::Impl(MPI_Comm comm,
+ const DeviceContext& deviceContext,
+ gmx::ArrayRef<const PpRanks> ppRanks) :
+ comm_(comm), requests_(ppRanks.size(), MPI_REQUEST_NULL), deviceContext_(deviceContext)
{
- GMX_RELEASE_ASSERT(GMX_THREAD_MPI, "PME-PP GPU Communication is currently only supported with thread-MPI enabled");
+ // Create streams to manage pipelining
+ ppCommManagers_.reserve(ppRanks.size());
+ for (auto& ppRank : ppRanks)
+ {
+ ppCommManagers_.emplace_back(PpCommManager{
+ ppRank, // details of the PP rank communicated with
+ std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false), // dedicated high-priority stream for this rank
+ nullptr, // event synchronizer, received from the PP rank on the thread-MPI path
+ { 0, 0 } }); // atom range, filled in by reinitCoordinateReceiver()
+ }
}
PmeCoordinateReceiverGpu::Impl::~Impl() = default;
-void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(rvec *d_x)
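+/*! \brief Re-initialize the atom range for each PP rank and, for thread-MPI, send the GPU coordinate buffer address to the PP ranks */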
+void PmeCoordinateReceiverGpu::Impl::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x)
{
-
- int ind_start = 0;
- int ind_end = 0;
- for (const auto &receiver : ppRanks_)
+ int indEnd = 0;
+ for (auto& ppCommManager : ppCommManagers_)
{
- ind_start = ind_end;
- ind_end = ind_start + receiver.numAtoms;
+ int indStart = indEnd;
+ indEnd = indStart + ppCommManager.ppRank.numAtoms;
- //Data will be transferred directly from GPU.
- void* sendBuf = reinterpret_cast<void*> (&d_x[ind_start]);
+ ppCommManager.atomRange = std::make_tuple(indStart, indEnd);
+ // Need to send the address to the PP rank only for thread-MPI, as the PP rank pushes the data directly with cudaMemcpy
+ if (GMX_THREAD_MPI)
+ {
+ // Data will be transferred directly from GPU.
+ void* sendBuf = reinterpret_cast<void*>(&d_x[indStart]);
#if GMX_MPI
- MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId,
- 0, comm_);
+ MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, ppCommManager.ppRank.rankId, 0, comm_);
+#else
+ GMX_UNUSED_VALUE(sendBuf);
#endif
-
+ }
}
}
-/*! \brief Receive coordinate data directly using CUDA memory copy */
-void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesFromPpCudaDirect(int ppRank)
+/*! \brief Receive coordinate synchronizer pointer from the PP ranks. */
+void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
{
+ GMX_ASSERT(GMX_THREAD_MPI,
+ "receiveCoordinatesSynchronizerFromPpCudaDirect is expected to be called only for "
+ "Thread-MPI");
+
// Data will be pushed directly from PP task
#if GMX_MPI
- // Receive event from PP task and add to PME stream, to ensure PME calculation doesn't
- // commence until coordinate data has been transferred
- GpuEventSynchronizer *ppSync;
- MPI_Recv(&ppSync, sizeof(GpuEventSynchronizer*),
- MPI_BYTE, ppRank, 0,
- comm_, MPI_STATUS_IGNORE);
- ppSync->enqueueWaitEvent(pmeStream_);
+ // Receive event from PP task
+ // NOLINTNEXTLINE(bugprone-sizeof-expression)
+ MPI_Irecv(&ppCommManagers_[ppRank].sync,
+ sizeof(GpuEventSynchronizer*),
+ MPI_BYTE,
+ ppRank,
+ 0,
+ comm_,
+ &(requests_[ppRank]));
+#else
+ GMX_UNUSED_VALUE(ppRank);
#endif
}
-PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(void *pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks)
- : impl_(new Impl(pmeStream, comm, ppRanks))
+/*! \brief Receive coordinate data using CUDA-aware MPI */
+void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+ int numAtoms,
+ int numBytes,
+ int ppRank)
+{
+ GMX_ASSERT(GMX_LIB_MPI,
+ "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI");
+
+#if GMX_MPI
+ MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &(requests_[ppRank]));
+#else
+ GMX_UNUSED_VALUE(recvbuf);
+ GMX_UNUSED_VALUE(numAtoms);
+ GMX_UNUSED_VALUE(numBytes);
+ GMX_UNUSED_VALUE(ppRank);
+#endif
+}
+
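+/*! \brief Wait for coordinates from a single PP rank and return the index of the sending rank */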
+int PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRank(int pipelineStage,
+ const DeviceStream& deviceStream)
+{
+#if GMX_MPI
+ int senderRank = -1; // Rank of PP task that is associated with this invocation.
+# if (!GMX_THREAD_MPI)
+ // Wait on data from any one of the PP sender GPUs
+ MPI_Waitany(requests_.size(), requests_.data(), &senderRank, MPI_STATUS_IGNORE);
+ GMX_ASSERT(senderRank >= 0, "Rank of sending PP task must be 0 or greater");
+ GMX_UNUSED_VALUE(pipelineStage);
+ GMX_UNUSED_VALUE(deviceStream);
+# else
+ // MPI_Waitany is not available in thread-MPI. However, the
+ // MPI_Wait here is not associated with data but is host-side
+ // scheduling code to receive a CUDA event, and will be executed
+ // in advance of the actual data transfer. Therefore we can
+ // receive in order of pipeline stage, still allowing the
+ // scheduled GPU-direct comms to initiate out-of-order in their
+ // respective streams. For cases with CPU force computations, the
+ // scheduling is less asynchronous (done on a per-step basis), so
+ // host-side improvements should be investigated as tracked in
+ // issue #4047
+ senderRank = pipelineStage;
+ MPI_Wait(&(requests_[senderRank]), MPI_STATUS_IGNORE);
+ ppCommManagers_[senderRank].sync->enqueueWaitEvent(deviceStream);
+# endif
+ return senderRank;
+#else
+ GMX_UNUSED_VALUE(pipelineStage);
+ GMX_UNUSED_VALUE(deviceStream);
+ return -1;
+#endif
+}
+
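+/*! \brief Wait for coordinates from all PP ranks */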
+void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream)
+{
+ for (int i = 0; i < static_cast<int>(ppCommManagers_.size()); i++)
+ {
+ synchronizeOnCoordinatesFromPpRank(i, deviceStream);
+ }
+}
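+
+/*! \brief Return the stream used for communication with a given sending PP rank */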
+DeviceStream* PmeCoordinateReceiverGpu::Impl::ppCommStream(int senderIndex)
+{
+ return ppCommManagers_[senderIndex].stream.get();
+}
+
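+/*! \brief Return the atom range assigned to a given sending PP rank */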
+std::tuple<int, int> PmeCoordinateReceiverGpu::Impl::ppCommAtomRange(int senderIndex)
+{
+ return ppCommManagers_[senderIndex].atomRange;
+}
+
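+/*! \brief Return the number of PP ranks that send coordinates to this PME rank */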
+int PmeCoordinateReceiverGpu::Impl::ppCommNumSenderRanks()
+{
+ return static_cast<int>(ppCommManagers_.size());
+}
+
+PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm comm,
+ const DeviceContext& deviceContext,
+ gmx::ArrayRef<PpRanks> ppRanks) :
+ impl_(new Impl(comm, deviceContext, ppRanks))
{
}
PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default;
-void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(rvec* d_x)
+void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x)
+{
+ impl_->reinitCoordinateReceiver(d_x);
+}
+
+void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank)
+{
+ impl_->receiveCoordinatesSynchronizerFromPpCudaDirect(ppRank);
+}
+
+void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf,
+ int numAtoms,
+ int numBytes,
+ int ppRank)
+{
+ impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank);
+}
+
+int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int senderIndex,
+ const DeviceStream& deviceStream)
+{
+ return impl_->synchronizeOnCoordinatesFromPpRank(senderIndex, deviceStream);
+}
+
+void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream)
+{
+ impl_->synchronizeOnCoordinatesFromAllPpRanks(deviceStream);
+}
+
+DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int senderIndex)
+{
+ return impl_->ppCommStream(senderIndex);
+}
+
+std::tuple<int, int> PmeCoordinateReceiverGpu::ppCommAtomRange(int senderIndex)
{
- impl_->sendCoordinateBufferAddressToPpRanks(d_x);
+ return impl_->ppCommAtomRange(senderIndex);
}
-void PmeCoordinateReceiverGpu::receiveCoordinatesFromPpCudaDirect(int ppRank)
+int PmeCoordinateReceiverGpu::ppCommNumSenderRanks()
{
- impl_->receiveCoordinatesFromPpCudaDirect(ppRank);
+ return impl_->ppCommNumSenderRanks();
}
-} //namespace gmx
+
+} // namespace gmx