From: Alan Gray Date: Thu, 14 Oct 2021 13:29:40 +0000 (+0000) Subject: Pipeline GPU PME Spline/Spread with PP Comms X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?p=alexxy%2Fgromacs.git;a=commitdiff_plain;h=c088e63019ebc68760d35c3bc916015864aa8e89 Pipeline GPU PME Spline/Spread with PP Comms --- diff --git a/docs/release-notes/2022/major/performance.rst b/docs/release-notes/2022/major/performance.rst index 580ef631a3..247715ff23 100644 --- a/docs/release-notes/2022/major/performance.rst +++ b/docs/release-notes/2022/major/performance.rst @@ -26,3 +26,19 @@ up to a factor of 3. :issue:`2875` :issue:`742` + + +PME-PP GPU Direct Communication Pipelining +"""""""""""""""""""""""""""""""""""""""""" + +For multi-GPU runs with direct PME-PP GPU communication enabled, the +PME rank can now pipeline the coordinate transfers with computation in +the PME Spread and Spline kernel (where the coordinates are +consumed). The data from each transfer is handled separately, allowing +computation and communication to be overlapped. This is expected to +be of most benefit on systems where hardware communication interfaces +are shared between multiple GPUs, e.g. PCIe within multi-GPU servers +or InfiniBand across multiple nodes. + +:issue:`3969` + diff --git a/src/gromacs/ewald/pme.cuh b/src/gromacs/ewald/pme.cuh index af0e258ae9..d166b22653 100644 --- a/src/gromacs/ewald/pme.cuh +++ b/src/gromacs/ewald/pme.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h index 93145d4e1e..f8653e20dd 100644 --- a/src/gromacs/ewald/pme.h +++ b/src/gromacs/ewald/pme.h @@ -119,6 +119,7 @@ private: std::vector<std::string> reasons_; }; +class PmeCoordinateReceiverGpu; } // namespace gmx enum @@ -373,17 +374,24 @@ GPU_FUNC_QUALIFIER void pme_gpu_prepare_computation(gmx_pme_t* GPU_FUNC_ARGU /*! \brief * Launches first stage of PME on GPU - spreading kernel. * - * \param[in] pme The PME data structure. - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates - * are ready in the device memory; nullptr allowed only on separate PME ranks. - * \param[in] wcycle The wallclock counter. - * \param[in] lambdaQ The Coulomb lambda of the current state of the - * system. Only used if FEP of Coulomb is active. + * \param[in] pme The PME data structure. + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates + * are ready in the device memory; nullptr allowed only + * on separate PME ranks. + * \param[in] wcycle The wallclock counter. + * \param[in] lambdaQ The Coulomb lambda of the current state of the + * system. Only used if FEP of Coulomb is active. 
+ * \param[in] useGpuDirectComm Whether direct GPU PME-PP communication is active + * \param[in] pmeCoordinateReceiverGpu Coordinate receiver object, which must be valid when + * direct GPU PME-PP communication is active */ -GPU_FUNC_QUALIFIER void pme_gpu_launch_spread(gmx_pme_t* GPU_FUNC_ARGUMENT(pme), - GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), - gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), - real GPU_FUNC_ARGUMENT(lambdaQ)) GPU_FUNC_TERM; +GPU_FUNC_QUALIFIER void pme_gpu_launch_spread( + gmx_pme_t* GPU_FUNC_ARGUMENT(pme), + GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), + gmx_wallcycle* GPU_FUNC_ARGUMENT(wcycle), + real GPU_FUNC_ARGUMENT(lambdaQ), + const bool GPU_FUNC_ARGUMENT(useGpuDirectComm), + gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM; /*! \brief * Launches middle stages of PME (FFT R2C, solving, FFT C2R) either on GPU or on CPU, depending on the run mode. diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h index 81f640df40..d285c19954 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu.h @@ -49,6 +49,8 @@ #include "gromacs/utility/gmxmpi.h" class DeviceStream; +class DeviceContext; + struct PpRanks; namespace gmx @@ -62,18 +64,30 @@ class PmeCoordinateReceiverGpu public: /*! \brief Creates PME GPU coordinate receiver object - * \param[in] pmeStream CUDA stream used for PME computations + * + * For multi-GPU runs, the PME GPU can receive coordinates from + * multiple PP GPUs. Data from these distinct communications can + * be handled separately in the PME spline/spread kernel, allowing + * pipelining which overlaps computation and communication. The + * class methods are designed to be called separately for each remote + * PP rank, and internally a different stream is used for each + * remote PP rank to allow overlapping. + * * \param[in] comm Communicator used for simulation + * \param[in] deviceContext GPU context * \param[in] ppRanks List of PP ranks */ - PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks); + PmeCoordinateReceiverGpu(MPI_Comm comm, const DeviceContext& deviceContext, gmx::ArrayRef<PpRanks> ppRanks); ~PmeCoordinateReceiverGpu(); /*! \brief + * Re-initialize: set atom ranges and, for thread-MPI case, * send coordinates buffer address to PP rank + * This is required after repartitioning since atom ranges and + * buffer allocations may have changed. * \param[in] d_x coordinates buffer in GPU memory */ - void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x); + void reinitCoordinateReceiver(DeviceBuffer<RVec> d_x); /*! \brief @@ -92,10 +106,37 @@ public: void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank); /*! \brief - * For lib MPI, wait for coordinates from PP ranks - * For thread MPI, enqueue PP co-ordinate transfer event into PME stream + * For lib MPI, wait for coordinates from any PP rank + * For thread MPI, enqueue PP coordinate transfer event received from PP + * rank determined from pipeline stage into given stream + * \param[in] pipelineStage stage of pipeline corresponding to this transfer + * \param[in] deviceStream stream in which to enqueue the wait event. + * \returns rank of sending PP task + */ + int synchronizeOnCoordinatesFromPpRank(int pipelineStage, const DeviceStream& deviceStream); + + /*! 
\brief Perform the above synchronizeOnCoordinatesFromPpRank for all PP ranks, + * enqueueing all events to a single stream + * \param[in] deviceStream stream in which to enqueue the wait events. + */ + void synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream); + + /*! \brief + * Return pointer to stream associated with specific PP rank sender index + * \param[in] senderIndex Index of sender PP rank. + */ + DeviceStream* ppCommStream(int senderIndex); + + /*! \brief + * Returns range of atoms involved in communication associated with specific PP rank sender index + * \param[in] senderIndex Index of sender PP rank. + */ + std::tuple<int, int> ppCommAtomRange(int senderIndex); + + /*! \brief + * Return number of PP ranks involved in PME-PP communication */ - void synchronizeOnCoordinatesFromPpRanks(); + int ppCommNumSenderRanks(); private: class Impl; diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp index 4e997d319b..9d727d14b6 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cpp @@ -62,8 +62,8 @@ class PmeCoordinateReceiverGpu::Impl }; /*!\brief Constructor stub. */ -PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStream */, - MPI_Comm /* comm */, +PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm /* comm */, + const DeviceContext& /* deviceContext */, gmx::ArrayRef<PpRanks> /* ppRanks */) : impl_(nullptr) { @@ -75,7 +75,7 @@ PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& /* pmeStr PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default; /*!\brief init PME-PP GPU communication stub */ -void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> /* d_x */) +void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer<RVec> /* d_x */) { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication initialization was called instead of the " @@ -99,13 +99,47 @@ void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffe "implementation."); } -void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks() +int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int /* pipelineStage */, + const DeviceStream& /* deviceStream */) { GMX_ASSERT(!impl_, "A CPU stub for PME-PP GPU communication was called instead of the correct " "implementation."); + return 0; } +void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& /* deviceStream */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); +} + +DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int /* senderIndex */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); + return nullptr; +} + +std::tuple<int, int> PmeCoordinateReceiverGpu::ppCommAtomRange(int /* senderIndex */) +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); + return std::make_tuple(0, 0); +} + +int PmeCoordinateReceiverGpu::ppCommNumSenderRanks() +{ + GMX_ASSERT(!impl_, + "A CPU stub for PME-PP GPU communication was called instead of the correct " + "implementation."); + return 0; +} + + } // namespace gmx #endif // !GMX_GPU_CUDA diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu index 
9de7c6676f..5b1fa48ade 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.cu @@ -56,33 +56,42 @@ namespace gmx { -PmeCoordinateReceiverGpu::Impl::Impl(const DeviceStream& pmeStream, - MPI_Comm comm, - gmx::ArrayRef<PpRanks> ppRanks) : - pmeStream_(pmeStream), comm_(comm), ppRanks_(ppRanks) +PmeCoordinateReceiverGpu::Impl::Impl(MPI_Comm comm, + const DeviceContext& deviceContext, + gmx::ArrayRef<PpRanks> ppRanks) : + comm_(comm), requests_(ppRanks.size(), MPI_REQUEST_NULL), deviceContext_(deviceContext) { - request_.resize(ppRanks.size()); - ppSync_.resize(ppRanks.size()); + // Create streams to manage pipelining + ppCommManagers_.reserve(ppRanks.size()); + for (auto& ppRank : ppRanks) + { + ppCommManagers_.emplace_back(PpCommManager{ + ppRank, + std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::High, false), + nullptr, + { 0, 0 } }); + } } PmeCoordinateReceiverGpu::Impl::~Impl() = default; -void PmeCoordinateReceiverGpu::Impl::sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x) +void PmeCoordinateReceiverGpu::Impl::reinitCoordinateReceiver(DeviceBuffer<RVec> d_x) { - // Need to send address to PP rank only for thread-MPI as PP rank pushes data using cudamemcpy - if (GMX_THREAD_MPI) + int indEnd = 0; + for (auto& ppCommManager : ppCommManagers_) { - int ind_start = 0; - int ind_end = 0; - for (const auto& receiver : ppRanks_) - { - ind_start = ind_end; - ind_end = ind_start + receiver.numAtoms; + int indStart = indEnd; + indEnd = indStart + ppCommManager.ppRank.numAtoms; + ppCommManager.atomRange = std::make_tuple(indStart, indEnd); + + // Need to send address to PP rank only for thread-MPI, as the PP rank pushes data using cudaMemcpy + if (GMX_THREAD_MPI) + { // Data will be transferred directly from GPU. 
- void* sendBuf = reinterpret_cast(&d_x[ind_start]); + void* sendBuf = reinterpret_cast(&d_x[indStart]); #if GMX_MPI - MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, receiver.rankId, 0, comm_); + MPI_Send(&sendBuf, sizeof(void**), MPI_BYTE, ppCommManager.ppRank.rankId, 0, comm_); #else GMX_UNUSED_VALUE(sendBuf); #endif @@ -102,8 +111,13 @@ void PmeCoordinateReceiverGpu::Impl::receiveCoordinatesSynchronizerFromPpCudaDir #if GMX_MPI // Receive event from PP task // NOLINTNEXTLINE(bugprone-sizeof-expression) - MPI_Irecv(&ppSync_[recvCount_], sizeof(GpuEventSynchronizer*), MPI_BYTE, ppRank, 0, comm_, &request_[recvCount_]); - recvCount_++; + MPI_Irecv(&ppCommManagers_[ppRank].sync, + sizeof(GpuEventSynchronizer*), + MPI_BYTE, + ppRank, + 0, + comm_, + &(requests_[ppRank])); #else GMX_UNUSED_VALUE(ppRank); #endif @@ -119,7 +133,7 @@ void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(Devic "launchReceiveCoordinatesFromPpCudaMpi is expected to be called only for Lib-MPI"); #if GMX_MPI - MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &request_[recvCount_++]); + MPI_Irecv(&recvbuf[numAtoms], numBytes, MPI_BYTE, ppRank, eCommType_COORD_GPU, comm_, &(requests_[ppRank])); #else GMX_UNUSED_VALUE(recvbuf); GMX_UNUSED_VALUE(numAtoms); @@ -128,42 +142,70 @@ void PmeCoordinateReceiverGpu::Impl::launchReceiveCoordinatesFromPpCudaMpi(Devic #endif } -void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRanks() +int PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromPpRank(int pipelineStage, + const DeviceStream& deviceStream) { - if (recvCount_ > 0) - { - // ensure PME calculation doesn't commence until coordinate data/remote events - // has been transferred #if GMX_MPI - MPI_Waitall(recvCount_, request_.data(), MPI_STATUS_IGNORE); + int senderRank = -1; // Rank of PP task that is associated with this invocation. +# if (!GMX_THREAD_MPI) + // Wait on data from any one of the PP sender GPUs + MPI_Waitany(requests_.size(), requests_.data(), &senderRank, MPI_STATUS_IGNORE); + GMX_ASSERT(senderRank >= 0, "Rank of sending PP task must be 0 or greater"); + GMX_UNUSED_VALUE(pipelineStage); + GMX_UNUSED_VALUE(deviceStream); +# else + // MPI_Waitany is not available in thread-MPI. However, the + // MPI_Wait here is not associated with data but is host-side + // scheduling code to receive a CUDA event, and will be executed + // in advance of the actual data transfer. Therefore we can + // receive in order of pipeline stage, still allowing the + // scheduled GPU-direct comms to initiate out-of-order in their + // respective streams. 
For cases with CPU force computations, the + // scheduling is less asynchronous (done on a per-step basis), so + // host-side improvements should be investigated as tracked in + // issue #4047 + senderRank = pipelineStage; + MPI_Wait(&(requests_[senderRank]), MPI_STATUS_IGNORE); + ppCommManagers_[senderRank].sync->enqueueWaitEvent(deviceStream); +# endif + return senderRank; #endif +} - // Make PME stream wait on PP to PME data trasnfer events - if (GMX_THREAD_MPI) - { - for (int i = 0; i < recvCount_; i++) - { - ppSync_[i]->enqueueWaitEvent(pmeStream_); - } - } - - // reset receive counter - recvCount_ = 0; +void PmeCoordinateReceiverGpu::Impl::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream) +{ + for (int i = 0; i < static_cast(ppCommManagers_.size()); i++) + { + synchronizeOnCoordinatesFromPpRank(i, deviceStream); } } +DeviceStream* PmeCoordinateReceiverGpu::Impl::ppCommStream(int senderIndex) +{ + return ppCommManagers_[senderIndex].stream.get(); +} -PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(const DeviceStream& pmeStream, - MPI_Comm comm, +std::tuple PmeCoordinateReceiverGpu::Impl::ppCommAtomRange(int senderIndex) +{ + return ppCommManagers_[senderIndex].atomRange; +} + +int PmeCoordinateReceiverGpu::Impl::ppCommNumSenderRanks() +{ + return ppCommManagers_.size(); +} + +PmeCoordinateReceiverGpu::PmeCoordinateReceiverGpu(MPI_Comm comm, + const DeviceContext& deviceContext, gmx::ArrayRef ppRanks) : - impl_(new Impl(pmeStream, comm, ppRanks)) + impl_(new Impl(comm, deviceContext, ppRanks)) { } PmeCoordinateReceiverGpu::~PmeCoordinateReceiverGpu() = default; -void PmeCoordinateReceiverGpu::sendCoordinateBufferAddressToPpRanks(DeviceBuffer d_x) +void PmeCoordinateReceiverGpu::reinitCoordinateReceiver(DeviceBuffer d_x) { - impl_->sendCoordinateBufferAddressToPpRanks(d_x); + impl_->reinitCoordinateReceiver(d_x); } void PmeCoordinateReceiverGpu::receiveCoordinatesSynchronizerFromPpCudaDirect(int ppRank) @@ -179,9 +221,31 @@ void PmeCoordinateReceiverGpu::launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffe impl_->launchReceiveCoordinatesFromPpCudaMpi(recvbuf, numAtoms, numBytes, ppRank); } -void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRanks() +int PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromPpRank(int senderIndex, + const DeviceStream& deviceStream) { - impl_->synchronizeOnCoordinatesFromPpRanks(); + return impl_->synchronizeOnCoordinatesFromPpRank(senderIndex, deviceStream); } +void PmeCoordinateReceiverGpu::synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream) +{ + impl_->synchronizeOnCoordinatesFromAllPpRanks(deviceStream); +} + +DeviceStream* PmeCoordinateReceiverGpu::ppCommStream(int senderIndex) +{ + return impl_->ppCommStream(senderIndex); +} + +std::tuple PmeCoordinateReceiverGpu::ppCommAtomRange(int senderIndex) +{ + return impl_->ppCommAtomRange(senderIndex); +} + +int PmeCoordinateReceiverGpu::ppCommNumSenderRanks() +{ + return impl_->ppCommNumSenderRanks(); +} + + } // namespace gmx diff --git a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h index 604079c0b0..d268091771 100644 --- a/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h +++ b/src/gromacs/ewald/pme_coordinate_receiver_gpu_impl.h @@ -52,25 +52,41 @@ class GpuEventSynchronizer; namespace gmx { -/*! \internal \brief Class with interfaces and data for CUDA version of PME coordinate receiving functionality */ +/*! 
\brief Object to manage communications with a specific PP rank */ +struct PpCommManager +{ + //! Details of PP rank that may be updated after repartitioning + const PpRanks& ppRank; + //! Stream used for communication with PP rank + std::unique_ptr<DeviceStream> stream; + //! Synchronization event to receive from PP rank + GpuEventSynchronizer* sync = nullptr; + //! Range of atoms corresponding to PP rank + std::tuple<int, int> atomRange = { 0, 0 }; +}; + +/*! \internal \brief Class with interfaces and data for CUDA version of PME coordinate receiving functionality */ class PmeCoordinateReceiverGpu::Impl { public: /*! \brief Creates PME GPU coordinate receiver object - * \param[in] pmeStream CUDA stream used for PME computations * \param[in] comm Communicator used for simulation + * \param[in] deviceContext GPU context * \param[in] ppRanks List of PP ranks */ - Impl(const DeviceStream& pmeStream, MPI_Comm comm, gmx::ArrayRef<PpRanks> ppRanks); + Impl(MPI_Comm comm, const DeviceContext& deviceContext, gmx::ArrayRef<PpRanks> ppRanks); ~Impl(); /*! \brief - * send coordinates buffer address to PP rank + * Re-initialize: set atom ranges and, for thread-MPI case, + * send coordinates buffer address to PP rank. + * This is required after repartitioning since atom ranges and + * buffer allocations may have changed. * \param[in] d_x coordinates buffer in GPU memory */ - void sendCoordinateBufferAddressToPpRanks(DeviceBuffer<RVec> d_x); + void reinitCoordinateReceiver(DeviceBuffer<RVec> d_x); /*! \brief * Receive coordinate synchronizer pointer from the PP ranks. @@ -88,24 +104,47 @@ public: void launchReceiveCoordinatesFromPpCudaMpi(DeviceBuffer<RVec> recvbuf, int numAtoms, int numBytes, int ppRank); /*! \brief - * For lib MPI, wait for coordinates from PP ranks - * For thread MPI, enqueue PP co-ordinate transfer event into PME stream + * For lib MPI, wait for coordinates from any PP rank + * For thread MPI, enqueue PP coordinate transfer event received from PP + * rank determined from pipeline stage into given stream + * \param[in] pipelineStage stage of pipeline corresponding to this transfer + * \param[in] deviceStream stream in which to enqueue the wait event. + * \returns rank of sending PP task + */ + int synchronizeOnCoordinatesFromPpRank(int pipelineStage, const DeviceStream& deviceStream); + + /*! \brief Perform the above synchronizeOnCoordinatesFromPpRank for all PP ranks, + * enqueueing all events to a single stream + * \param[in] deviceStream stream in which to enqueue the wait events. + */ + void synchronizeOnCoordinatesFromAllPpRanks(const DeviceStream& deviceStream); + + /*! \brief + * Return pointer to stream associated with specific PP rank sender index + * \param[in] senderIndex Index of sender PP rank. + */ + DeviceStream* ppCommStream(int senderIndex); + + /*! \brief + * Returns range of atoms involved in communication associated with specific PP rank sender index + * \param[in] senderIndex Index of sender PP rank. + */ + std::tuple<int, int> ppCommAtomRange(int senderIndex); + + /*! \brief + * Return number of PP ranks involved in PME-PP communication */ - void synchronizeOnCoordinatesFromPpRanks(); + int ppCommNumSenderRanks(); private: - //! CUDA stream for PME operations - const DeviceStream& pmeStream_; //! communicator for simulation MPI_Comm comm_; - //! list of PP ranks - gmx::ArrayRef<PpRanks> ppRanks_; - //! vector of MPI requests - std::vector<MPI_Request> request_; - //! vector of synchronization events to receive from PP tasks - std::vector<GpuEventSynchronizer*> ppSync_; - //! counter of messages to receive - int recvCount_ = 0; + //! 
MPI requests, one per PP rank + std::vector requests_; + //! GPU context handle (not used in CUDA) + const DeviceContext& deviceContext_; + //! Communication manager objects corresponding to multiple sending PP ranks + std::vector ppCommManagers_; }; } // namespace gmx diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp index 084bb3f6cc..680fcec0fc 100644 --- a/src/gromacs/ewald/pme_gpu.cpp +++ b/src/gromacs/ewald/pme_gpu.cpp @@ -59,6 +59,7 @@ #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/stringutil.h" +#include "gromacs/ewald/pme_coordinate_receiver_gpu.h" #include "pme_gpu_internal.h" #include "pme_gpu_settings.h" @@ -189,10 +190,12 @@ void pme_gpu_prepare_computation(gmx_pme_t* pme, } } -void pme_gpu_launch_spread(gmx_pme_t* pme, - GpuEventSynchronizer* xReadyOnDevice, - gmx_wallcycle* wcycle, - const real lambdaQ) +void pme_gpu_launch_spread(gmx_pme_t* pme, + GpuEventSynchronizer* xReadyOnDevice, + gmx_wallcycle* wcycle, + const real lambdaQ, + const bool useGpuDirectComm, + gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu) { GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled."); GMX_ASSERT(!GMX_GPU_CUDA || xReadyOnDevice || !pme->bPPnode, @@ -215,7 +218,8 @@ void pme_gpu_launch_spread(gmx_pme_t* pme, const bool spreadCharges = true; wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu); wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme); - pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ); + pme_gpu_spread( + pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu); wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme); wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu); } diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp index 96dc1f4db9..2f7da67a3a 100644 --- a/src/gromacs/ewald/pme_gpu_internal.cpp +++ b/src/gromacs/ewald/pme_gpu_internal.cpp @@ -75,6 +75,7 @@ #include "gromacs/utility/logger.h" #include "gromacs/utility/stringutil.h" #include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme_coordinate_receiver_gpu.h" #if GMX_GPU_CUDA # include "pme.cuh" @@ -593,7 +594,10 @@ static void pme_gpu_init_internal(PmeGpu* pmeGpu, const DeviceContext& deviceCon */ #if GMX_GPU_CUDA - pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0]; + pmeGpu->kernelParams->usePipeline = false; + pmeGpu->kernelParams->pipelineAtomStart = 0; + pmeGpu->kernelParams->pipelineAtomEnd = 0; + pmeGpu->maxGridWidthX = deviceContext.deviceInfo().prop.maxGridSize[0]; #else // Use this path for any non-CUDA GPU acceleration // TODO: is there no really global work size limit in OpenCL? 
@@ -1276,12 +1280,14 @@ static auto selectSpreadKernelPtr(const PmeGpu* pmeGpu, return kernelPtr; } -void pme_gpu_spread(const PmeGpu* pmeGpu, - GpuEventSynchronizer* xReadyOnDevice, - real** h_grids, - bool computeSplines, - bool spreadCharges, - const real lambda) +void pme_gpu_spread(const PmeGpu* pmeGpu, + GpuEventSynchronizer* xReadyOnDevice, + real** h_grids, + bool computeSplines, + bool spreadCharges, + const real lambda, + const bool useGpuDirectComm, + gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu) { GMX_ASSERT( pmeGpu->common->ngrids == 1 || pmeGpu->common->ngrids == 2, @@ -1350,6 +1356,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, PmeStage timingId; PmeGpuProgramImpl::PmeKernelHandle kernelPtr = nullptr; + const bool writeGlobalOrSaveSplines = writeGlobal || (!recalculateSplines); if (computeSplines) { if (spreadCharges) @@ -1357,7 +1364,7 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, timingId = PmeStage::SplineAndSpread; kernelPtr = selectSplineAndSpreadKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines), + writeGlobalOrSaveSplines, pmeGpu->common->ngrids); } else @@ -1365,43 +1372,116 @@ void pme_gpu_spread(const PmeGpu* pmeGpu, timingId = PmeStage::Spline; kernelPtr = selectSplineKernelPtr(pmeGpu, pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines), + writeGlobalOrSaveSplines, pmeGpu->common->ngrids); } } else { timingId = PmeStage::Spread; - kernelPtr = selectSpreadKernelPtr(pmeGpu, - pmeGpu->settings.threadsPerAtom, - writeGlobal || (!recalculateSplines), - pmeGpu->common->ngrids); + kernelPtr = selectSpreadKernelPtr( + pmeGpu, pmeGpu->settings.threadsPerAtom, writeGlobalOrSaveSplines, pmeGpu->common->ngrids); } pme_gpu_start_timing(pmeGpu, timingId); auto* timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + + kernelParamsPtr->usePipeline = computeSplines && spreadCharges && useGpuDirectComm + && (pmeCoordinateReceiverGpu->ppCommNumSenderRanks() > 1) + && !writeGlobalOrSaveSplines; + if (kernelParamsPtr->usePipeline) + { + int numStagesInPipeline = pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); + + for (int i = 0; i < numStagesInPipeline; i++) + { + int senderRank; + if (useGpuDirectComm) + { + senderRank = pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRank( + i, *(pmeCoordinateReceiverGpu->ppCommStream(i))); + } + else + { + senderRank = i; + } + + // set kernel configuration options specific to this stage of the pipeline + std::tie(kernelParamsPtr->pipelineAtomStart, kernelParamsPtr->pipelineAtomEnd) = + pmeCoordinateReceiverGpu->ppCommAtomRange(senderRank); + const int blockCount = static_cast(std::ceil( + static_cast(kernelParamsPtr->pipelineAtomEnd - kernelParamsPtr->pipelineAtomStart) + / atomsPerBlock)); + auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); + config.gridSize[0] = dimGrid.first; + config.gridSize[1] = dimGrid.second; + DeviceStream* launchStream = pmeCoordinateReceiverGpu->ppCommStream(senderRank); + + #if c_canEmbedBuffers - const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); #else - const auto kernelArgs = - prepareGpuKernelArguments(kernelPtr, - config, - kernelParamsPtr, - &kernelParamsPtr->atoms.d_theta, - &kernelParamsPtr->atoms.d_dtheta, - &kernelParamsPtr->atoms.d_gridlineIndices, - &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], - &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], - &kernelParamsPtr->grid.d_fractShiftsTable, - 
&kernelParamsPtr->grid.d_gridlineIndicesTable, - &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], - &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], - &kernelParamsPtr->atoms.d_coordinates); + const auto kernelArgs = + prepareGpuKernelArguments(kernelPtr, + config, + kernelParamsPtr, + &kernelParamsPtr->atoms.d_theta, + &kernelParamsPtr->atoms.d_dtheta, + &kernelParamsPtr->atoms.d_gridlineIndices, + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], + &kernelParamsPtr->grid.d_fractShiftsTable, + &kernelParamsPtr->grid.d_gridlineIndicesTable, + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], + &kernelParamsPtr->atoms.d_coordinates); #endif - launchGpuKernel( - kernelPtr, config, pmeGpu->archSpecific->pmeStream_, timingEvent, "PME spline/spread", kernelArgs); + launchGpuKernel(kernelPtr, config, *launchStream, timingEvent, "PME spline/spread", kernelArgs); + } + // Set dependencies for PME stream on all pipeline streams + for (int i = 0; i < pmeCoordinateReceiverGpu->ppCommNumSenderRanks(); i++) + { + GpuEventSynchronizer event; + event.markEvent(*(pmeCoordinateReceiverGpu->ppCommStream(i))); + event.enqueueWaitEvent(pmeGpu->archSpecific->pmeStream_); + } + } + else // pipelining is not in use + { + if (useGpuDirectComm) // Sync all PME-PP communications to PME stream + { + pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromAllPpRanks(pmeGpu->archSpecific->pmeStream_); + } + +#if c_canEmbedBuffers + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); +#else + const auto kernelArgs = + prepareGpuKernelArguments(kernelPtr, + config, + kernelParamsPtr, + &kernelParamsPtr->atoms.d_theta, + &kernelParamsPtr->atoms.d_dtheta, + &kernelParamsPtr->atoms.d_gridlineIndices, + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_A], + &kernelParamsPtr->grid.d_realGrid[FEP_STATE_B], + &kernelParamsPtr->grid.d_fractShiftsTable, + &kernelParamsPtr->grid.d_gridlineIndicesTable, + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_A], + &kernelParamsPtr->atoms.d_coefficients[FEP_STATE_B], + &kernelParamsPtr->atoms.d_coordinates); +#endif + + launchGpuKernel(kernelPtr, + config, + pmeGpu->archSpecific->pmeStream_, + timingEvent, + "PME spline/spread", + kernelArgs); + } + pme_gpu_stop_timing(pmeGpu, timingId); const auto& settings = pmeGpu->settings; diff --git a/src/gromacs/ewald/pme_gpu_internal.h b/src/gromacs/ewald/pme_gpu_internal.h index 7baa6bd347..0a6ee2a5d2 100644 --- a/src/gromacs/ewald/pme_gpu_internal.h +++ b/src/gromacs/ewald/pme_gpu_internal.h @@ -340,21 +340,30 @@ void pme_gpu_destroy_3dfft(const PmeGpu* pmeGpu); /*! \libinternal \brief * A GPU spline computation and charge spreading function. * - * \param[in] pmeGpu The PME GPU structure. - * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are ready in the device memory; - * can be nullptr when invoked on a separate PME rank or from PME tests. - * \param[out] h_grids The host-side grid buffers (used only if the result of the spread is expected on the host, - * e.g. testing or host-side FFT) - * \param[in] computeSplines Should the computation of spline parameters and gridline indices be performed. - * \param[in] spreadCharges Should the charges/coefficients be spread on the grid. - * \param[in] lambda The lambda value of the current system state. 
- */ -GPU_FUNC_QUALIFIER void pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), - GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), - float** GPU_FUNC_ARGUMENT(h_grids), - bool GPU_FUNC_ARGUMENT(computeSplines), - bool GPU_FUNC_ARGUMENT(spreadCharges), - real GPU_FUNC_ARGUMENT(lambda)) GPU_FUNC_TERM; + * \param[in] pmeGpu The PME GPU structure. + * \param[in] xReadyOnDevice Event synchronizer indicating that the coordinates are + * ready in the device memory; can be nullptr when invoked + * on a separate PME rank or from PME tests. + * \param[out] h_grids The host-side grid buffers (used only if the result + * of the spread is expected on the host, e.g. testing + * or host-side FFT) + * \param[in] computeSplines Should the computation of spline parameters and gridline + * indices be performed. + * \param[in] spreadCharges Should the charges/coefficients be spread on the grid. + * \param[in] lambda The lambda value of the current system state. + * \param[in] useGpuDirectComm Whether direct GPU PME-PP communication is active + * \param[in] pmeCoordinateReceiverGpu Coordinate receiver object, which must be valid when + * direct GPU PME-PP communication is active + */ +GPU_FUNC_QUALIFIER void +pme_gpu_spread(const PmeGpu* GPU_FUNC_ARGUMENT(pmeGpu), + GpuEventSynchronizer* GPU_FUNC_ARGUMENT(xReadyOnDevice), + float** GPU_FUNC_ARGUMENT(h_grids), + bool GPU_FUNC_ARGUMENT(computeSplines), + bool GPU_FUNC_ARGUMENT(spreadCharges), + real GPU_FUNC_ARGUMENT(lambda), + const bool GPU_FUNC_ARGUMENT(useGpuDirectComm), + gmx::PmeCoordinateReceiverGpu* GPU_FUNC_ARGUMENT(pmeCoordinateReceiverGpu)) GPU_FUNC_TERM; /*! \libinternal \brief * 3D FFT R2C/C2R routine. diff --git a/src/gromacs/ewald/pme_gpu_types.h b/src/gromacs/ewald/pme_gpu_types.h index e2c067390a..76e111e765 100644 --- a/src/gromacs/ewald/pme_gpu_types.h +++ b/src/gromacs/ewald/pme_gpu_types.h @@ -224,6 +224,15 @@ struct PmeGpuKernelParamsBase * before launching spreading. */ struct PmeGpuDynamicParams current; + + /*! \brief Whether pipelining with PP communications is active + * char rather than bool to avoid problem with OpenCL compiler */ + char usePipeline; + /*! \brief Start atom for this stage of pipeline */ + int pipelineAtomStart; + /*! \brief End atom for this stage of pipeline */ + int pipelineAtomEnd; + /* These texture objects are only used in CUDA and are related to the grid size. */ /*! \brief Texture object for accessing grid.d_fractShiftsTable */ HIDE_FROM_OPENCL_COMPILER(DeviceTexture) fractShiftsTableTexture; diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp index 3151ec1ca7..56002966be 100644 --- a/src/gromacs/ewald/pme_only.cpp +++ b/src/gromacs/ewald/pme_only.cpp @@ -218,6 +218,9 @@ static gmx_pme_t* gmx_pmeonly_switch(std::vector* pmedata, } /*! \brief Called by PME-only ranks to receive coefficients and coordinates + * + * Note that with GPU direct communication the transfer is only initiated, it is the responsibility + * of the caller to synchronize prior to launching spread. * * \param[in] pme PME data structure. * \param[in,out] pme_pp PME-PP communication structure. @@ -438,9 +441,8 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, "GPU Direct PME-PP communication has been enabled, " "but PME run mode is not PmeRunMode::GPU\n"); - // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses. 
- pme_pp->pmeCoordinateReceiverGpu->sendCoordinateBufferAddressToPpRanks( - stateGpu->getCoordinates()); + // This rank will have its data accessed directly by PP rank, so needs to send the remote addresses and re-set atom ranges associated with transfers. + pme_pp->pmeCoordinateReceiverGpu->reinitCoordinateReceiver(stateGpu->getCoordinates()); pme_pp->pmeForceSenderGpu->setForceSendBuffer(pme_gpu_get_device_f(pme)); } } @@ -495,11 +497,6 @@ static int gmx_pme_recv_coeffs_coords(struct gmx_pme_t* pme, } } - if (pme_pp->useGpuDirectComm) - { - pme_pp->pmeCoordinateReceiverGpu->synchronizeOnCoordinatesFromPpRanks(); - } - status = pmerecvqxX; } @@ -673,9 +670,7 @@ int gmx_pmeonly(struct gmx_pme_t* pme, if (useGpuPmePpCommunication) { pme_pp->pmeCoordinateReceiverGpu = std::make_unique( - deviceStreamManager->stream(gmx::DeviceStreamType::Pme), - pme_pp->mpi_comm_mysim, - pme_pp->ppRanks); + pme_pp->mpi_comm_mysim, deviceStreamManager->context(), pme_pp->ppRanks); pme_pp->pmeForceSenderGpu = std::make_unique(pme_gpu_get_f_ready_synchronizer(pme), pme_pp->mpi_comm_mysim, @@ -775,7 +770,12 @@ int gmx_pmeonly(struct gmx_pme_t* pme, // TODO: with pme on GPU the receive should make a list of synchronizers and pass it here #3157 auto xReadyOnDevice = nullptr; - pme_gpu_launch_spread(pme, xReadyOnDevice, wcycle, lambda_q); + pme_gpu_launch_spread(pme, + xReadyOnDevice, + wcycle, + lambda_q, + pme_pp->useGpuDirectComm, + pme_pp->pmeCoordinateReceiverGpu.get()); pme_gpu_launch_complex_transforms(pme, wcycle, stepWork); pme_gpu_launch_gather(pme, wcycle, lambda_q); output = pme_gpu_wait_finish_task(pme, computeEnergyAndVirial, lambda_q, wcycle); diff --git a/src/gromacs/ewald/pme_spread.cu b/src/gromacs/ewald/pme_spread.cu index d0856602a9..f5ba345181 100644 --- a/src/gromacs/ewald/pme_spread.cu +++ b/src/gromacs/ewald/pme_spread.cu @@ -200,7 +200,7 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU float atomCharge; const int blockIndex = blockIdx.y * gridDim.x + blockIdx.x; - const int atomIndexOffset = blockIndex * atomsPerBlock; + const int atomIndexOffset = blockIndex * atomsPerBlock + kernelParams.pipelineAtomStart; /* Thread index w.r.t. 
block */ const int threadLocalId = @@ -225,8 +225,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU /* Charges, required for both spline and spread */ if (c_useAtomDataPrefetch) { - pme_gpu_stage_atom_data(sm_coefficients, - kernelParams.atoms.d_coefficients[0]); + pme_gpu_stage_atom_data( + sm_coefficients, &kernelParams.atoms.d_coefficients[0][kernelParams.pipelineAtomStart]); __syncthreads(); atomCharge = sm_coefficients[atomIndexLocal]; } @@ -237,7 +237,8 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU if (computeSplines) { - const float3* __restrict__ gm_coordinates = asFloat3(kernelParams.atoms.d_coordinates); + const float3* __restrict__ gm_coordinates = + asFloat3(&kernelParams.atoms.d_coordinates[kernelParams.pipelineAtomStart]); if (c_useAtomDataPrefetch) { // Coordinates @@ -274,8 +275,12 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU /* Spreading */ if (spreadCharges) { - spread_charges( - kernelParams, &atomCharge, sm_gridlineIndices, sm_theta); + + if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd)) + { + spread_charges( + kernelParams, &atomCharge, sm_gridlineIndices, sm_theta); + } } if (numGrids == 2) { @@ -293,8 +298,11 @@ __launch_bounds__(c_spreadMaxThreadsPerBlock) CLANG_DISABLE_OPTIMIZATION_ATTRIBU } if (spreadCharges) { - spread_charges( - kernelParams, &atomCharge, sm_gridlineIndices, sm_theta); + if (!kernelParams.usePipeline || (atomIndexGlobal < kernelParams.pipelineAtomEnd)) + { + spread_charges( + kernelParams, &atomCharge, sm_gridlineIndices, sm_theta); + } } } } diff --git a/src/gromacs/ewald/tests/pmetestcommon.cpp b/src/gromacs/ewald/tests/pmetestcommon.cpp index a3c4409fca..0016cd8024 100644 --- a/src/gromacs/ewald/tests/pmetestcommon.cpp +++ b/src/gromacs/ewald/tests/pmetestcommon.cpp @@ -70,6 +70,7 @@ #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/logger.h" #include "gromacs/utility/stringutil.h" +#include "gromacs/ewald/pme_coordinate_receiver_gpu.h" #include "testutils/test_hardware_environment.h" #include "testutils/testasserts.h" @@ -342,7 +343,18 @@ void pmePerformSplineAndSpread(gmx_pme_t* pme, const real lambdaQ = 1.0; // no synchronization needed as x is transferred in the PME stream GpuEventSynchronizer* xReadyOnDevice = nullptr; - pme_gpu_spread(pme->gpu, xReadyOnDevice, fftgrid, computeSplines, spreadCharges, lambdaQ); + + bool useGpuDirectComm = false; + gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr; + + pme_gpu_spread(pme->gpu, + xReadyOnDevice, + fftgrid, + computeSplines, + spreadCharges, + lambdaQ, + useGpuDirectComm, + pmeCoordinateReceiverGpu); } break; #endif diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index 241cc3b9fc..a2ccaed47b 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -54,6 +54,7 @@ #include "gromacs/domdec/partition.h" #include "gromacs/essentialdynamics/edsam.h" #include "gromacs/ewald/pme.h" +#include "gromacs/ewald/pme_coordinate_receiver_gpu.h" #include "gromacs/ewald/pme_pp.h" #include "gromacs/ewald/pme_pp_comm_gpu.h" #include "gromacs/gmxlib/network.h" @@ -746,7 +747,10 @@ static inline void launchPmeGpuSpread(gmx_pme_t* pmedata, gmx_wallcycle* wcycle) { pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork); - pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ); + bool useGpuDirectComm = false; + gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = 
nullptr; + pme_gpu_launch_spread( + pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu); } /*! \brief Launch the FFT and gather stages of PME GPU
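
For readers who want the scheduling pattern of this patch in isolation: below is a minimal standalone CUDA sketch, not GROMACS code, of the pipelining implemented in pme_gpu_spread() and PmeCoordinateReceiverGpu. All names here (processChunk, numSenders, etc.) are hypothetical, and an ordinary cudaMemcpyAsync stands in for the GPU-direct PME-PP receive. Each sender owns a chunk of the coordinate buffer and a dedicated high-priority stream; the chunk's transfer and the kernel that consumes it run back-to-back in that stream, so different chunks overlap, and events then make the main PME stream wait on all pipeline streams, mirroring the GpuEventSynchronizer loop at the end of the pipelined branch above.

// pipelined_chunks.cu -- illustrative sketch only, compile with nvcc
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

__global__ void processChunk(const float3* x, float* out, int atomStart, int atomEnd)
{
    // Stand-in for spline/spread work on the [atomStart, atomEnd) chunk,
    // guarded by the chunk end as the patched kernel guards with pipelineAtomEnd
    const int i = atomStart + blockIdx.x * blockDim.x + threadIdx.x;
    if (i < atomEnd)
    {
        out[i] = x[i].x + x[i].y + x[i].z;
    }
}

int main()
{
    const int numSenders = 4, atomsPerSender = 1 << 16, numAtoms = numSenders * atomsPerSender;
    std::vector<float3> h_x(numAtoms, make_float3(1.f, 2.f, 3.f));
    float3* d_x;
    float*  d_out;
    cudaMalloc(&d_x, numAtoms * sizeof(float3));
    cudaMalloc(&d_out, numAtoms * sizeof(float));

    // One high-priority stream per sender, as the patch creates per PP rank
    int leastPriority = 0, greatestPriority = 0;
    cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
    std::vector<cudaStream_t> ppStream(numSenders);
    std::vector<cudaEvent_t>  chunkDone(numSenders);
    for (int s = 0; s < numSenders; s++)
    {
        cudaStreamCreateWithPriority(&ppStream[s], cudaStreamNonBlocking, greatestPriority);
        cudaEventCreateWithFlags(&chunkDone[s], cudaEventDisableTiming);
    }
    cudaStream_t pmeStream;
    cudaStreamCreate(&pmeStream);

    const int threads = 256;
    for (int s = 0; s < numSenders; s++)
    {
        const int atomStart = s * atomsPerSender;
        const int atomEnd   = atomStart + atomsPerSender;
        // Per-sender transfer: stands in for the GPU-direct PME-PP receive
        cudaMemcpyAsync(d_x + atomStart, h_x.data() + atomStart,
                        atomsPerSender * sizeof(float3), cudaMemcpyHostToDevice, ppStream[s]);
        // The chunk's kernel launches in the same stream, so it starts as soon
        // as this chunk has arrived, independently of the other chunks
        const int blocks = (atomEnd - atomStart + threads - 1) / threads;
        processChunk<<<blocks, threads, 0, ppStream[s]>>>(d_x, d_out, atomStart, atomEnd);
        cudaEventRecord(chunkDone[s], ppStream[s]);
    }
    // Downstream PME work (FFT/solve/gather) needs all chunks: make the main
    // stream wait on every pipeline stream before proceeding
    for (int s = 0; s < numSenders; s++)
    {
        cudaStreamWaitEvent(pmeStream, chunkDone[s], 0);
    }
    cudaStreamSynchronize(pmeStream);
    printf("pipelined %d chunks of %d atoms each\n", numSenders, atomsPerSender);

    for (int s = 0; s < numSenders; s++)
    {
        cudaEventDestroy(chunkDone[s]);
        cudaStreamDestroy(ppStream[s]);
    }
    cudaStreamDestroy(pmeStream);
    cudaFree(d_x);
    cudaFree(d_out);
    return 0;
}

Giving the per-sender streams high priority mirrors DeviceStreamPriority::High in the patch: spread work for a chunk that has already arrived can run ahead while later chunks are still in flight.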
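The lib-MPI branch of synchronizeOnCoordinatesFromPpRank() drains the posted receives in completion order rather than posting order. A minimal MPI-only sketch of that idea, under the assumption of one float chunk per PP rank (rank 0 plays the PME rank; buffer sizes and tags are arbitrary):

// waitany_order.cpp -- illustrative sketch only, compile with mpicxx, run with mpirun
#include <cstdio>
#include <vector>
#include <mpi.h>

int main(int argc, char** argv)
{
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    const int chunk = 1024;
    if (rank == 0) // "PME rank": post one receive per "PP rank" up front
    {
        std::vector<float>       buf(chunk * (size - 1));
        std::vector<MPI_Request> requests(size - 1, MPI_REQUEST_NULL);
        for (int s = 1; s < size; s++)
        {
            MPI_Irecv(buf.data() + (s - 1) * chunk, chunk, MPI_FLOAT, s, 0, MPI_COMM_WORLD,
                      &requests[s - 1]);
        }
        for (int stage = 0; stage < size - 1; stage++)
        {
            // Drain in completion order: whichever sender's data lands first
            // can have its pipeline stage (kernel launch) handled first
            int senderIndex = -1;
            MPI_Waitany(size - 1, requests.data(), &senderIndex, MPI_STATUS_IGNORE);
            printf("stage %d: chunk from sender index %d ready\n", stage, senderIndex);
        }
    }
    else // "PP rank": send our chunk to the PME rank
    {
        std::vector<float> x(chunk, 1.0f);
        MPI_Send(x.data(), chunk, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
    }
    MPI_Finalize();
    return 0;
}

With thread-MPI, MPI_Waitany is unavailable, which is why the patch instead waits per pipeline stage and lets the enqueued CUDA events provide the out-of-order overlap on the device.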