From: Alan Gray Date: Wed, 6 Oct 2021 09:26:43 +0000 (+0000) Subject: Rework GPU halo and state propagator streams and dependencies to get better overlap X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=ccd5bef5f2afb57b15efcdacd9b2b3fedbd5edef;p=alexxy%2Fgromacs.git Rework GPU halo and state propagator streams and dependencies to get better overlap --- diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index 8a634b7c07..f063f3b0e7 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -3205,14 +3205,7 @@ void constructGpuHaloExchange(const gmx::MDLogger& mdlog, for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++) { cr.dd->gpuHaloExchange[d].push_back(std::make_unique( - cr.dd, - d, - cr.mpi_comm_mygroup, - deviceStreamManager.context(), - deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal), - deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal), - pulse, - wcycle)); + cr.dd, d, cr.mpi_comm_mygroup, deviceStreamManager.context(), pulse, wcycle)); } } } @@ -3230,26 +3223,31 @@ void reinitGpuHaloExchange(const t_commrec& cr, } } -void communicateGpuHaloCoordinates(const t_commrec& cr, - const matrix box, - GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) +GpuEventSynchronizer* communicateGpuHaloCoordinates(const t_commrec& cr, + const matrix box, + GpuEventSynchronizer* dependencyEvent) { + GpuEventSynchronizer* eventPtr = dependencyEvent; for (int d = 0; d < cr.dd->ndim; d++) { for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++) { - cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent); + eventPtr = cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, eventPtr); } } + return eventPtr; } -void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces) +void communicateGpuHaloForces(const t_commrec& cr, + bool accumulateForces, + gmx::FixedCapacityVector* dependencyEvents) { for (int d = cr.dd->ndim - 1; d >= 0; d--) { for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--) { - cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces); + cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces, dependencyEvents); + dependencyEvents->push_back(cr.dd->gpuHaloExchange[d][pulse]->getForcesReadyOnDeviceEvent()); } } } diff --git a/src/gromacs/domdec/domdec.h b/src/gromacs/domdec/domdec.h index bb964de98e..c7d7483cbe 100644 --- a/src/gromacs/domdec/domdec.h +++ b/src/gromacs/domdec/domdec.h @@ -92,6 +92,8 @@ class RangePartitioning; class VirtualSitesHandler; template class ArrayRef; +template +class FixedCapacityVector; } // namespace gmx /*! \brief Returns the global topology atom number belonging to local atom index i. @@ -265,20 +267,24 @@ void reinitGpuHaloExchange(const t_commrec& cr, /*! \brief GPU halo exchange of coordinates buffer. - * \param [in] cr The commrec object - * \param [in] box Coordinate box (from which shifts will be constructed) - * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device + * \param [in] cr The commrec object + * \param [in] box Coordinate box (from which shifts will be constructed) + * \param [in] dependencyEvent Dependency event for this operation + * \returns Event recorded when this operation has been launched */ -void communicateGpuHaloCoordinates(const t_commrec& cr, - const matrix box, - GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); - +GpuEventSynchronizer* communicateGpuHaloCoordinates(const t_commrec& cr, + const matrix box, + GpuEventSynchronizer* dependencyEvent); -/*! \brief GPU halo exchange of force buffer. - * \param [in] cr The commrec object +/*! \brief Wait for copy of nonlocal part of coordinate array from GPU to CPU + * following coordinate halo exchange + * \param [in] cr The commrec object * \param [in] accumulateForces True if forces should accumulate, otherwise they are set + * \param [in] dependencyEvents Dependency events for this operation */ -void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces); +void communicateGpuHaloForces(const t_commrec& cr, + bool accumulateForces, + gmx::FixedCapacityVector* dependencyEvents); /*! \brief Wraps the \c positions so that atoms from the same * update group share the same periodic image wrt \c box. diff --git a/src/gromacs/domdec/gpuhaloexchange.h b/src/gromacs/domdec/gpuhaloexchange.h index adc8d66712..431407b6bc 100644 --- a/src/gromacs/domdec/gpuhaloexchange.h +++ b/src/gromacs/domdec/gpuhaloexchange.h @@ -46,7 +46,9 @@ #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/math/vectypes.h" +#include "gromacs/utility/arrayref.h" #include "gromacs/utility/basedefinitions.h" +#include "gromacs/utility/fixedcapacityvector.h" #include "gromacs/utility/gmxmpi.h" struct gmx_domdec_t; @@ -66,27 +68,26 @@ class GpuHaloExchange public: /*! \brief Creates GPU Halo Exchange object. * - * Coordinate Halo exchange will be performed in \c - * StreamNonLocal, and the \c communicateHaloCoordinates - * method must be called before any subsequent operations that - * access non-local parts of the coordinate buffer (such as - * the non-local non-bonded kernels). It also must be called - * after the local coordinates buffer operations (where the - * coordinates are copied to the device and hence the \c - * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange - * will be performed in \c streamNonLocal and the \c - * communicateHaloForces method must be called after the - * non-local buffer operations, after the local force buffer - * has been copied to the GPU (if CPU forces are present), and - * before the local buffer operations. The force halo exchange - * does not yet support virial steps. + * Coordinate Halo exchange will be performed in its own stream + * with appropriate event-based synchronization, and the \c + * communicateHaloCoordinates method must be called before any + * subsequent operations that access non-local parts of the + * coordinate buffer (such as the non-local non-bonded + * kernels). It also must be called after the local coordinates + * buffer operations (where the coordinates are copied to the + * device and hence the \c coordinatesReadyOnDeviceEvent is + * recorded). Force Halo exchange will also be performed in its + * own stream with appropriate event-based synchronization, and + * the \c communicateHaloForces method must be called after the + * non-local buffer operations, after the local force buffer has + * been copied to the GPU (if CPU forces are present), and before + * the local buffer operations. The force halo exchange does not + * yet support virial steps. * * \param [inout] dd domdec structure * \param [in] dimIndex the dimension index for this instance * \param [in] mpi_comm_mysim communicator used for simulation * \param [in] deviceContext GPU device context - * \param [in] streamLocal local NB CUDA stream. - * \param [in] streamNonLocal non-local NB CUDA stream. * \param [in] pulse the communication pulse for this instance * \param [in] wcycle The wallclock counter */ @@ -94,8 +95,6 @@ public: int dimIndex, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - const DeviceStream& streamLocal, - const DeviceStream& streamNonLocal, int pulse, gmx_wallcycle* wcycle); ~GpuHaloExchange(); @@ -116,15 +115,18 @@ public: * Must be called after local setCoordinates (which records an * event when the coordinate data has been copied to the * device). - * \param [in] box Coordinate box (from which shifts will be constructed) - * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device + * \param [in] box Coordinate box (from which shifts will be constructed) + * \param [in] dependencyEvent Dependency event for this operation + * \returns Event recorded when this operation has been launched */ - void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); + GpuEventSynchronizer* communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* dependencyEvent); /*! \brief GPU halo exchange of force buffer. * \param[in] accumulateForces True if forces should accumulate, otherwise they are set + * \param[in] dependencyEvents Dependency events for this operation */ - void communicateHaloForces(bool accumulateForces); + void communicateHaloForces(bool accumulateForces, + FixedCapacityVector* dependencyEvents); /*! \brief Get the event synchronizer for the forces ready on device. * \returns The event to synchronize the stream that consumes forces on device. diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cpp b/src/gromacs/domdec/gpuhaloexchange_impl.cpp index e00d4e3d60..5a839b27bf 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cpp +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -67,8 +67,6 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* /* dd */, int /* dimIndex */, MPI_Comm /* mpi_comm_mysim */, const DeviceContext& /* deviceContext */, - const DeviceStream& /*streamLocal */, - const DeviceStream& /*streamNonLocal */, int /*pulse */, gmx_wallcycle* /*wcycle*/) : impl_(nullptr) @@ -96,16 +94,18 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer /* d_coordinatesBuffer */, } /*!\brief apply X halo exchange stub. */ -void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */, - GpuEventSynchronizer* /*coordinatesOnDeviceEvent*/) +GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */, + GpuEventSynchronizer* /*dependencyEvent*/) { GMX_ASSERT(!impl_, "A CPU stub for GPU Halo Exchange exchange was called insted of the correct " "implementation."); + return nullptr; } /*!\brief apply F halo exchange stub. */ -void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces) +void GpuHaloExchange::communicateHaloForces(bool /* accumulateForces */, + FixedCapacityVector* /*dependencyEvents*/) { GMX_ASSERT(!impl_, "A CPU stub for GPU Halo Exchange was called insted of the correct implementation."); diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu index 5796b96980..b0dab24e8a 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cu +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu @@ -200,7 +200,7 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin()); copyToDeviceBuffer( - &d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr); + &d_indexMap_, h_indexMap_.data(), 0, newSize, *haloStream_, GpuApiCallBehavior::Async, nullptr); } #if GMX_MPI @@ -270,19 +270,16 @@ void GpuHaloExchange::Impl::enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynch 0, mpi_comm_mysim_, MPI_STATUS_IGNORE); - remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_); + remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(*haloStream_); } -void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box, - GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) +GpuEventSynchronizer* GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box, + GpuEventSynchronizer* dependencyEvent) { - wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu); - if (pulse_ == 0) - { - // ensure stream waits until coordinate data is available on device - coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_); - } + + // ensure stream waits until dependency has been satisfied + dependencyEvent->enqueueWaitEvent(*haloStream_); wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveX); @@ -318,8 +315,7 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box const auto kernelArgs = prepareGpuKernelArguments( kernelFn, config, &sendBuf, &d_x, &indexMap, &size, &coordinateShift); - launchGpuKernel( - kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs); + launchGpuKernel(kernelFn, config, *haloStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs); } wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveX); @@ -331,28 +327,41 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box // wait for remote co-ordinates is implicit with process-MPI as non-local stream is synchronized before MPI calls // and MPI_Waitall call makes sure both neighboring ranks' non-local stream is synchronized before data transfer is initiated - if (GMX_THREAD_MPI && pulse_ == 0) + if (GMX_THREAD_MPI && dimIndex_ == 0 && pulse_ == 0) { - enqueueWaitRemoteCoordinatesReadyEvent(coordinatesReadyOnDeviceEvent); + enqueueWaitRemoteCoordinatesReadyEvent(dependencyEvent); } float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_]; communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_); + coordinateHaloLaunched_.markEvent(*haloStream_); + wallcycle_stop(wcycle_, WallCycleCounter::MoveX); + + return &coordinateHaloLaunched_; } // The following method should be called after non-local buffer operations, -// and before the local buffer operations. It operates in the non-local stream. -void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) +// and before the local buffer operations. +void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces, + FixedCapacityVector* dependencyEvents) { + // Consider time spent in communicateHaloData as Comm.F counter // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync wallcycle_start(wcycle_, WallCycleCounter::MoveF); + while (dependencyEvents->size() > 0) + { + auto dependency = dependencyEvents->back(); + dependency->enqueueWaitEvent(*haloStream_); + dependencyEvents->pop_back(); + } + float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_; - // Communicate halo data (in non-local stream) + // Communicate halo data communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_); wallcycle_stop(wcycle_, WallCycleCounter::MoveF); @@ -361,19 +370,6 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveF); float3* d_f = d_f_; - // If this is the last pulse and index (noting the force halo - // exchanges across multiple pulses and indices are called in - // reverse order) then perform the following preparation - // activities - if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1))) - { - // ensure non-local stream waits for local stream, due to dependence on - // the previous H2D copy of CPU forces (if accumulateForces is true) - // or local force clearing. - GpuEventSynchronizer eventLocal; - eventLocal.markEvent(localStream_); - eventLocal.enqueueWaitEvent(nonLocalStream_); - } // Unpack halo buffer into force array @@ -405,14 +401,10 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces) const auto kernelArgs = prepareGpuKernelArguments(kernelFn, config, &d_f, &recvBuf, &indexMap, &size); - launchGpuKernel( - kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs); + launchGpuKernel(kernelFn, config, *haloStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs); } - if (pulse_ == 0) - { - fReadyOnDevice_.markEvent(nonLocalStream_); - } + fReadyOnDevice_.markEvent(*haloStream_); wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveF); wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); @@ -447,12 +439,12 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaMPI(float3* sendPtr, // no need to wait for haloDataReadyOnDevice event if this rank is not sending any data if (sendSize > 0) { - // wait for non local stream to complete all outstanding + // wait for halo stream to complete all outstanding // activities, to ensure that buffer is up-to-date in GPU memory // before transferring to remote rank // ToDo: Replace stream synchronize with event synchronize - nonLocalStream_.synchronize(); + haloStream_->synchronize(); } // perform halo exchange directly in device buffers @@ -491,7 +483,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr, sendPtr, sendSize * DIM * sizeof(float), cudaMemcpyDeviceToDevice, - nonLocalStream_.stream()); + haloStream_->stream()); CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed"); } @@ -506,7 +498,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr, GMX_ASSERT(haloDataTransferLaunched_ != nullptr, "Halo exchange requires valid event to synchronize data transfer initiated in " "remote rank"); - haloDataTransferLaunched_->markEvent(nonLocalStream_); + haloDataTransferLaunched_->markEvent(*haloStream_); MPI_Sendrecv(&haloDataTransferLaunched_, sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression) @@ -521,7 +513,7 @@ void GpuHaloExchange::Impl::communicateHaloDataWithCudaDirect(float3* sendPtr, mpi_comm_mysim_, MPI_STATUS_IGNORE); - haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_); + haloDataTransferRemote->enqueueWaitEvent(*haloStream_); #else GMX_UNUSED_VALUE(sendRank); GMX_UNUSED_VALUE(recvRank); @@ -538,8 +530,6 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, int dimIndex, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - const DeviceStream& localStream, - const DeviceStream& nonLocalStream, int pulse, gmx_wallcycle* wcycle) : dd_(dd), @@ -551,8 +541,7 @@ GpuHaloExchange::Impl::Impl(gmx_domdec_t* dd, haloDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr), mpi_comm_mysim_(mpi_comm_mysim), deviceContext_(deviceContext), - localStream_(localStream), - nonLocalStream_(nonLocalStream), + haloStream_(new DeviceStream(deviceContext, DeviceStreamPriority::High, false)), dimIndex_(dimIndex), pulse_(pulse), wcycle_(wcycle) @@ -580,11 +569,9 @@ GpuHaloExchange::GpuHaloExchange(gmx_domdec_t* dd, int dimIndex, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - const DeviceStream& localStream, - const DeviceStream& nonLocalStream, int pulse, gmx_wallcycle* wcycle) : - impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle)) + impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, pulse, wcycle)) { } @@ -603,15 +590,16 @@ void GpuHaloExchange::reinitHalo(DeviceBuffer d_coordinatesBuffer, DeviceB impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer)); } -void GpuHaloExchange::communicateHaloCoordinates(const matrix box, - GpuEventSynchronizer* coordinatesReadyOnDeviceEvent) +GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix box, + GpuEventSynchronizer* dependencyEvent) { - impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent); + return impl_->communicateHaloCoordinates(box, dependencyEvent); } -void GpuHaloExchange::communicateHaloForces(bool accumulateForces) +void GpuHaloExchange::communicateHaloForces(bool accumulateForces, + FixedCapacityVector* dependencyEvents) { - impl_->communicateHaloForces(accumulateForces); + impl_->communicateHaloForces(accumulateForces, dependencyEvents); } GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent() diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cuh b/src/gromacs/domdec/gpuhaloexchange_impl.cuh index 6190a56b95..c29834616a 100644 --- a/src/gromacs/domdec/gpuhaloexchange_impl.cuh +++ b/src/gromacs/domdec/gpuhaloexchange_impl.cuh @@ -75,8 +75,6 @@ public: * \param [in] dimIndex the dimension index for this instance * \param [in] mpi_comm_mysim communicator used for simulation * \param [in] deviceContext GPU device context - * \param [in] localStream local NB CUDA stream - * \param [in] nonLocalStream non-local NB CUDA stream * \param [in] pulse the communication pulse for this instance * \param [in] wcycle The wallclock counter */ @@ -84,8 +82,6 @@ public: int dimIndex, MPI_Comm mpi_comm_mysim, const DeviceContext& deviceContext, - const DeviceStream& localStream, - const DeviceStream& nonLocalStream, int pulse, gmx_wallcycle* wcycle); ~Impl(); @@ -101,14 +97,17 @@ public: /*! \brief * GPU halo exchange of coordinates buffer * \param [in] box Coordinate box (from which shifts will be constructed) - * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device + * \param [in] dependencyEvent Dependency event for this operation + * \returns Event recorded when this operation has been launched */ - void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); + GpuEventSynchronizer* communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* dependencyEvent); /*! \brief GPU halo exchange of force buffer - * \param[in] accumulateForces True if forces should accumulate, otherwise they are set + * \param [in] accumulateForces True if forces should accumulate, otherwise they are set + * \param [in] dependencyEvents Dependency events for this operation */ - void communicateHaloForces(bool accumulateForces); + void communicateHaloForces(bool accumulateForces, + FixedCapacityVector* dependencyEvents); /*! \brief Get the event synchronizer for the forces ready on device. * \returns The event to synchronize the stream that consumes forces on device. @@ -150,8 +149,8 @@ private: int recvSize, int recvRank); - /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in non-local - * stream \param [in] eventSync event recorded when coordinates/forces are ready to device + /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in halo stream + * \param [in] eventSync event recorded when coordinates/forces are ready to device */ void enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent); @@ -211,10 +210,8 @@ private: MPI_Comm mpi_comm_mysim_; //! GPU context object const DeviceContext& deviceContext_; - //! CUDA stream for local non-bonded calculations - const DeviceStream& localStream_; - //! CUDA stream for non-local non-bonded calculations - const DeviceStream& nonLocalStream_; + //! CUDA stream for this halo exchange + DeviceStream* haloStream_; //! full coordinates buffer in GPU memory float3* d_x_ = nullptr; //! full forces buffer in GPU memory @@ -229,6 +226,8 @@ private: gmx_wallcycle* wcycle_ = nullptr; //! The atom offset for receive (x) or send (f) for dimension index and pulse corresponding to this halo exchange instance int atomOffset_ = 0; + //! Event triggered when coordinate halo has been launched + GpuEventSynchronizer coordinateHaloLaunched_; }; } // namespace gmx diff --git a/src/gromacs/domdec/tests/haloexchange_mpi.cpp b/src/gromacs/domdec/tests/haloexchange_mpi.cpp index 1f1138fb36..63659998ef 100644 --- a/src/gromacs/domdec/tests/haloexchange_mpi.cpp +++ b/src/gromacs/domdec/tests/haloexchange_mpi.cpp @@ -150,8 +150,8 @@ void gpuHalo(gmx_domdec_t* dd, matrix box, HostVector* h_x, int numAtomsTo { for (int pulse = 0; pulse < dd->comm->cd[d].numPulses(); pulse++) { - gpuHaloExchange[d].push_back(GpuHaloExchange( - dd, d, MPI_COMM_WORLD, deviceContext, deviceStream, deviceStream, pulse, nullptr)); + gpuHaloExchange[d].push_back( + GpuHaloExchange(dd, d, MPI_COMM_WORLD, deviceContext, pulse, nullptr)); } } diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp index ee3e8cdd46..241cc3b9fc 100644 --- a/src/gromacs/mdlib/sim_util.cpp +++ b/src/gromacs/mdlib/sim_util.cpp @@ -1132,12 +1132,13 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork || runScheduleWork->simulationWork.havePpDomainDecomposition; const int atomStart = 0; - fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(), - nbv->getNumAtoms(AtomLocality::Local), - nbv->getGridIndices(), - atomStart, - accumulate, - stateGpu->fReducedOnDevice()); + fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit( + stateGpu->getForces(), + nbv->getNumAtoms(AtomLocality::Local), + nbv->getGridIndices(), + atomStart, + accumulate, + stateGpu->fReducedOnDevice(AtomLocality::Local)); // register forces and add dependencies fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv)); @@ -1174,15 +1175,12 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, } } - if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange) + if (runScheduleWork->domainWork.haveCpuLocalForceWork + || (runScheduleWork->simulationWork.havePpDomainDecomposition + && !runScheduleWork->simulationWork.useGpuHaloExchange)) { - // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed - if (!runScheduleWork->simulationWork.havePpDomainDecomposition) - { - const bool useGpuForceBufferOps = true; - fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency( - stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::All, useGpuForceBufferOps)); - } + fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency( + stateGpu->fReadyOnDevice(AtomLocality::Local)); } if (runScheduleWork->simulationWork.useGpuHaloExchange) @@ -1197,16 +1195,23 @@ static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork, const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork; const int atomStart = dd_numHomeAtoms(*cr->dd); - fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(stateGpu->getForces(), - nbv->getNumAtoms(AtomLocality::NonLocal), - nbv->getGridIndices(), - atomStart, - accumulate); + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit( + stateGpu->getForces(), + nbv->getNumAtoms(AtomLocality::NonLocal), + nbv->getGridIndices(), + atomStart, + accumulate, + stateGpu->fReducedOnDevice(AtomLocality::NonLocal)); // register forces and add dependencies - // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce( Nbnxm::gpu_get_f(nbv->gpu_nbv)); + + if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer) + { + fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency( + stateGpu->fReadyOnDevice(AtomLocality::NonLocal)); + } } } @@ -1266,6 +1271,17 @@ void do_force(FILE* fplog, runScheduleWork->stepWork = setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, simulationWork); const StepWorkload& stepWork = runScheduleWork->stepWork; + if (stepWork.useGpuFHalo && !runScheduleWork->domainWork.haveCpuLocalForceWork) + { + // GPU Force halo exchange will set a subset of local atoms with remote non-local data + // First clear local portion of force array, so that untouched atoms are zero. + // The dependency for this is that forces from previous timestep have been consumed, + // which is satisfied when getCoordinatesReadyOnDeviceEvent has been marked. + stateGpu->clearForcesOnGpu(AtomLocality::Local, + stateGpu->getCoordinatesReadyOnDeviceEvent( + AtomLocality::Local, simulationWork, stepWork)); + } + /* At a search step we need to start the first balancing region * somewhere early inside the step after communication during domain * decomposition (and not during the previous step as usual). @@ -1579,16 +1595,18 @@ void do_force(FILE* fplog, } else { + GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr; if (stepWork.useGpuXHalo) { // The following must be called after local setCoordinates (which records an event // when the coordinate data has been copied to the device). - communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice); + gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice); if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork) { // non-local part of coordinate buffer must be copied back to host for CPU work - stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal); + stateGpu->copyCoordinatesFromGpu( + x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched); } } else @@ -1608,10 +1626,11 @@ void do_force(FILE* fplog, { stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal); } - nbv->convertCoordinatesGpu(AtomLocality::NonLocal, - stateGpu->getCoordinates(), - stateGpu->getCoordinatesReadyOnDeviceEvent( - AtomLocality::NonLocal, simulationWork, stepWork)); + nbv->convertCoordinatesGpu( + AtomLocality::NonLocal, + stateGpu->getCoordinates(), + stateGpu->getCoordinatesReadyOnDeviceEvent( + AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched)); } else { @@ -2085,13 +2104,11 @@ void do_force(FILE* fplog, { // If there exist CPU forces, data from halo exchange should accumulate into these bool accumulateForces = domainWork.haveCpuLocalForceWork; - if (!accumulateForces) - { - // Force halo exchange will set a subset of local atoms with remote non-local data - // First clear local portion of force array, so that untouched atoms are zero - stateGpu->clearForcesOnGpu(AtomLocality::Local); - } - communicateGpuHaloForces(*cr, accumulateForces); + gmx::FixedCapacityVector gpuForceHaloDependencies; + gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local)); + gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal)); + + communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies); } else { @@ -2232,15 +2249,7 @@ void do_force(FILE* fplog, // These should be unified. if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo) { - // Note: AtomLocality::All is used for the non-DD case because, as in this - // case copyForcesToGpu() uses a separate stream, it allows overlap of - // CPU force H2D with GPU force tasks on all streams including those in the - // local stream which would otherwise be implicit dependencies for the - // transfer and would not overlap. - auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local - : AtomLocality::All; - - stateGpu->copyForcesToGpu(forceWithShift, locality); + stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local); } if (stepWork.computeNonbondedForces) diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp index 970bcd6781..85c29398e0 100644 --- a/src/gromacs/mdrun/md.cpp +++ b/src/gromacs/mdrun/md.cpp @@ -1522,18 +1522,17 @@ void gmx::LegacySimulator::do_md() && do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple)); // This applies Leap-Frog, LINCS and SETTLE in succession - integrator->integrate( - stateGpu->getForcesReadyOnDeviceEvent( - AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps), - ir->delta_t, - true, - bCalcVir, - shake_vir, - doTemperatureScaling, - ekind->tcstat, - doParrinelloRahman, - ir->nstpcouple * ir->delta_t, - M); + integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent( + runScheduleWork->stepWork, runScheduleWork->simulationWork), + ir->delta_t, + true, + bCalcVir, + shake_vir, + doTemperatureScaling, + ekind->tcstat, + doParrinelloRahman, + ir->nstpcouple * ir->delta_t, + M); // Copy velocities D2H after update if: // - Globals are computed this step (includes the energy output steps). diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu.h b/src/gromacs/mdtypes/state_propagator_data_gpu.h index 9e8dc576a6..ad07df2c13 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu.h +++ b/src/gromacs/mdtypes/state_propagator_data_gpu.h @@ -188,15 +188,17 @@ public: * steps and if update is not offloaded, the coordinates are provided by the H2D copy and the * returned synchronizer indicates that the copy is complete. * - * \param[in] atomLocality Locality of the particles to wait for. - * \param[in] simulationWork The simulation lifetime flags. - * \param[in] stepWork The step lifetime flags. + * \param[in] atomLocality Locality of the particles to wait for. + * \param[in] simulationWork The simulation lifetime flags. + * \param[in] stepWork The step lifetime flags. + * \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched. * * \returns The event to synchronize the stream that consumes coordinates on device. */ GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality, const SimulationWorkload& simulationWork, - const StepWorkload& stepWork); + const StepWorkload& stepWork, + GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr); /*! \brief Blocking wait until coordinates are copied to the device. * @@ -212,12 +214,15 @@ public: */ void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent); - /*! \brief Copy positions from the GPU memory. + /*! \brief Copy positions from the GPU memory, with an optional explicit dependency. * * \param[in] h_x Positions buffer in the host memory. * \param[in] atomLocality Locality of the particles to copy. + * \param[in] dependency Dependency event for this operation. */ - void copyCoordinatesFromGpu(gmx::ArrayRef h_x, AtomLocality atomLocality); + void copyCoordinatesFromGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality, + GpuEventSynchronizer* dependency = nullptr); /*! \brief Wait until coordinates are available on the host. * @@ -271,8 +276,9 @@ public: /*! \brief Clear forces in the GPU memory. * * \param[in] atomLocality Locality of the particles to clear. + * \param[in] dependency Dependency event for this operation. */ - void clearForcesOnGpu(AtomLocality atomLocality); + void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency); /*! \brief Get the event synchronizer for the forces ready on device. * @@ -281,20 +287,27 @@ public: * 1. The forces are copied to the device (when GPU buffer ops are off) * 2. The forces are reduced on the device (GPU buffer ops are on) * - * \todo Pass step workload instead of the useGpuFBufferOps boolean. - * - * \param[in] atomLocality Locality of the particles to wait for. - * \param[in] useGpuFBufferOps If the force buffer ops are offloaded to the GPU. + * \param[in] stepWork Step workload flags + * \param[in] simulationWork Simulation workload flags * * \returns The event to synchronize the stream that consumes forces on device. */ - GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps); + GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork, + SimulationWorkload simulationWork); /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU. * - * \returns The event to mark when forces are reduced on the GPU. + * \param[in] atomLocality Locality of the particles to wait for. + * \returns The event to mark when forces are reduced on the GPU. + */ + GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality); + + /*! \brief Getter for the event synchronizer for the forces are ready on the GPU. + * + * \param[in] atomLocality Locality of the particles to wait for. + * \returns The event to mark when forces are ready on the GPU. */ - GpuEventSynchronizer* fReducedOnDevice(); + GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality); /*! \brief Copy forces from the GPU memory. * diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp index d4fc488720..618856d0ce 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl.cpp @@ -103,7 +103,8 @@ DeviceBuffer StatePropagatorDataGpu::getCoordinates() GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent( AtomLocality /* atomLocality */, const SimulationWorkload& /* simulationWork */, - const StepWorkload& /* stepWork */) + const StepWorkload& /* stepWork */, + GpuEventSynchronizer* /* gpuCoordinateHaloLaunched */) { GMX_ASSERT(!impl_, "A CPU stub method from GPU state propagator data was called instead of one from " @@ -141,14 +142,14 @@ void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality /* atomLoca } void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef /* h_x */, - AtomLocality /* atomLocality */) + AtomLocality /* atomLocality */, + GpuEventSynchronizer* /*dependency */) { GMX_ASSERT(!impl_, "A CPU stub method from GPU state propagator data was called instead of one from " "GPU implementation."); } - DeviceBuffer StatePropagatorDataGpu::getVelocities() { GMX_ASSERT(!impl_, @@ -197,15 +198,24 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRef h_x, AtomLocality atomLocality); + void copyCoordinatesFromGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality, + GpuEventSynchronizer* dependency = nullptr); /*! \brief Wait until coordinates are available on the host. * @@ -253,8 +258,9 @@ public: /*! \brief Clear forces in the GPU memory. * * \param[in] atomLocality Locality of the particles to clear. + * \param[in] dependency Dependency event for this operation. */ - void clearForcesOnGpu(AtomLocality atomLocality); + void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency); /*! \brief Get the event synchronizer for the forces ready on device. * @@ -263,20 +269,27 @@ public: * 1. The forces are copied to the device (when GPU buffer ops are off) * 2. The forces are reduced on the device (GPU buffer ops are on) * - * \todo Pass step workload instead of the useGpuFBufferOps boolean. - * - * \param[in] atomLocality Locality of the particles to wait for. - * \param[in] useGpuFBufferOps If the force buffer ops are offloaded to the GPU. + * \param[in] stepWork Step workload flags + * \param[in] simulationWork Simulation workload flags * * \returns The event to synchronize the stream that consumes forces on device. */ - GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps); + GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork, + SimulationWorkload simulationWork); - /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU. + /*! \brief Getter for the event synchronizer for when forces are reduced on the GPU. * - * \returns The event to mark when forces are reduced on the GPU. + * \param[in] atomLocality Locality of the particles to wait for. + * \returns The event to mark when forces are reduced on the GPU. + */ + GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality); + + /*! \brief Getter for the event synchronizer for the forces are ready for GPU update. + * + * \param[in] atomLocality Locality of the particles to wait for. + * \returns The event to mark when forces are ready for GPU update. */ - GpuEventSynchronizer* fReducedOnDevice(); + GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality); /*! \brief Copy forces from the GPU memory. * @@ -327,6 +340,9 @@ private: EnumerationArray vCopyStreams_ = { { nullptr } }; // Streams to use for forces H2D and D2H copies (one event for each atom locality) EnumerationArray fCopyStreams_ = { { nullptr } }; + // Streams internal to this module + std::unique_ptr copyInStream_; + std::unique_ptr memsetStream_; /*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality) * @@ -343,8 +359,8 @@ private: //! An array of events that indicate H2D copy of forces is complete (one event for each atom locality) EnumerationArray fReadyOnDevice_; - //! An event that the forces were reduced on the GPU - GpuEventSynchronizer fReducedOnDevice_; + //! An array of events that indicate the forces were reduced on the GPU (one event for each atom locality) + EnumerationArray fReducedOnDevice_; //! An array of events that indicate D2H copy of forces is complete (one event for each atom locality) EnumerationArray fReadyOnHost_; diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp index aa63099ffa..30478ef046 100644 --- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp +++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp @@ -95,6 +95,9 @@ StatePropagatorDataGpu::Impl::Impl(const DeviceStreamManager& deviceStreamManage fCopyStreams_[AtomLocality::Local] = localStream_; fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_; fCopyStreams_[AtomLocality::All] = updateStream_; + + copyInStream_ = std::make_unique(deviceContext_, DeviceStreamPriority::Normal, false); + memsetStream_ = std::make_unique(deviceContext_, DeviceStreamPriority::Normal, false); } StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream, @@ -340,23 +343,28 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef h_x, AtomLocality atomLocality) +void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality, + GpuEventSynchronizer* dependency) { GMX_ASSERT(atomLocality < AtomLocality::All, formatString("Wrong atom locality. Only Local and NonLocal are allowed for " @@ -393,6 +403,11 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRefenqueueWaitEvent(*deviceStream); + } + wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu); wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData); @@ -476,11 +491,13 @@ DeviceBuffer StatePropagatorDataGpu::Impl::getForces() return d_f_; } +// Copy CPU forces to GPU using stream internal to this module to allow overlap +// with GPU force calculations. void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef h_f, AtomLocality atomLocality) { GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality."); - const DeviceStream* deviceStream = fCopyStreams_[atomLocality]; + DeviceStream* deviceStream = copyInStream_.get(); GMX_ASSERT(deviceStream != nullptr, "No stream is valid for copying forces with given atom locality."); @@ -494,10 +511,14 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRefenqueueWaitEvent(*deviceStream); + GMX_ASSERT(deviceStream != nullptr, "No stream is valid for clearing forces with given atom locality."); @@ -506,26 +527,33 @@ void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality) clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream); + fReadyOnDevice_[atomLocality].markEvent(*deviceStream); + wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData); wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu); } -GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality, - bool useGpuFBufferOps) +GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork, + SimulationWorkload simulationWork) { - if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps) + if (stepWork.useGpuFBufferOps && !simulationWork.useCpuPmePpCommunication) { - return &fReducedOnDevice_; + return &fReducedOnDevice_[AtomLocality::Local]; } else { - return &fReadyOnDevice_[atomLocality]; + return &fReadyOnDevice_[AtomLocality::Local]; } } -GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice() +GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice(AtomLocality atomLocality) +{ + return &fReducedOnDevice_[atomLocality]; +} + +GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReadyOnDevice(AtomLocality atomLocality) { - return &fReducedOnDevice_; + return &fReadyOnDevice_[atomLocality]; } void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef h_f, AtomLocality atomLocality) @@ -617,9 +645,11 @@ void StatePropagatorDataGpu::copyCoordinatesToGpu(const gmx::ArrayRefgetCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork); + return impl_->getCoordinatesReadyOnDeviceEvent( + atomLocality, simulationWork, stepWork, gpuCoordinateHaloLaunched); } void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality) @@ -632,9 +662,11 @@ void StatePropagatorDataGpu::setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpd impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent); } -void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef h_x, AtomLocality atomLocality) +void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef h_x, + AtomLocality atomLocality, + GpuEventSynchronizer* dependency) { - return impl_->copyCoordinatesFromGpu(h_x, atomLocality); + return impl_->copyCoordinatesFromGpu(h_x, atomLocality, dependency); } void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality) @@ -675,20 +707,25 @@ void StatePropagatorDataGpu::copyForcesToGpu(const gmx::ArrayRefcopyForcesToGpu(h_f, atomLocality); } -void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality) +void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency) +{ + return impl_->clearForcesOnGpu(atomLocality, dependency); +} + +GpuEventSynchronizer* StatePropagatorDataGpu::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork, + SimulationWorkload simulationWork) { - return impl_->clearForcesOnGpu(atomLocality); + return impl_->getLocalForcesReadyOnDeviceEvent(stepWork, simulationWork); } -GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality, - bool useGpuFBufferOps) +GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality atomLocality) { - return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps); + return impl_->fReducedOnDevice(atomLocality); } -GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice() +GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality atomLocality) { - return impl_->fReducedOnDevice(); + return impl_->fReadyOnDevice(atomLocality); } void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef h_f, AtomLocality atomLocality)