for (int pulse = cr.dd->gpuHaloExchange[d].size(); pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
{
cr.dd->gpuHaloExchange[d].push_back(std::make_unique<gmx::GpuHaloExchange>(
- cr.dd,
- d,
- cr.mpi_comm_mygroup,
- deviceStreamManager.context(),
- deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal),
- deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal),
- pulse,
- wcycle));
+ cr.dd, d, cr.mpi_comm_mygroup, deviceStreamManager.context(), pulse, wcycle));
}
}
}
}
}
-void communicateGpuHaloCoordinates(const t_commrec& cr,
- const matrix box,
- GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+GpuEventSynchronizer* communicateGpuHaloCoordinates(const t_commrec& cr,
+ const matrix box,
+ GpuEventSynchronizer* dependencyEvent)
{
+ GpuEventSynchronizer* eventPtr = dependencyEvent;
for (int d = 0; d < cr.dd->ndim; d++)
{
for (int pulse = 0; pulse < cr.dd->comm->cd[d].numPulses(); pulse++)
{
- cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+ eventPtr = cr.dd->gpuHaloExchange[d][pulse]->communicateHaloCoordinates(box, eventPtr);
}
}
+ return eventPtr;
}
-void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces)
+void communicateGpuHaloForces(const t_commrec& cr,
+ bool accumulateForces,
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents)
{
for (int d = cr.dd->ndim - 1; d >= 0; d--)
{
for (int pulse = cr.dd->comm->cd[d].numPulses() - 1; pulse >= 0; pulse--)
{
- cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces);
+ cr.dd->gpuHaloExchange[d][pulse]->communicateHaloForces(accumulateForces, dependencyEvents);
+ dependencyEvents->push_back(cr.dd->gpuHaloExchange[d][pulse]->getForcesReadyOnDeviceEvent());
}
}
}
class VirtualSitesHandler;
template<typename>
class ArrayRef;
+template<typename, size_t>
+class FixedCapacityVector;
} // namespace gmx
/*! \brief Returns the global topology atom number belonging to local atom index i.
/*! \brief GPU halo exchange of coordinates buffer.
- * \param [in] cr The commrec object
- * \param [in] box Coordinate box (from which shifts will be constructed)
- * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+ * \param [in] cr The commrec object
+ * \param [in] box Coordinate box (from which shifts will be constructed)
+ * \param [in] dependencyEvent Dependency event for this operation
+ * \returns Event recorded when this operation has been launched
*/
-void communicateGpuHaloCoordinates(const t_commrec& cr,
- const matrix box,
- GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
-
+GpuEventSynchronizer* communicateGpuHaloCoordinates(const t_commrec& cr,
+ const matrix box,
+ GpuEventSynchronizer* dependencyEvent);
-/*! \brief GPU halo exchange of force buffer.
- * \param [in] cr The commrec object
+/*! \brief GPU halo exchange of force buffer.
+ * \param [in] cr The commrec object
* \param [in] accumulateForces True if forces should accumulate, otherwise they are set
+ * \param [in] dependencyEvents Dependency events for this operation
*/
-void communicateGpuHaloForces(const t_commrec& cr, bool accumulateForces);
+void communicateGpuHaloForces(const t_commrec& cr,
+ bool accumulateForces,
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents);
/*! \brief Wraps the \c positions so that atoms from the same
* update group share the same periodic image wrt \c box.
#include "gromacs/gpu_utils/devicebuffer_datatype.h"
#include "gromacs/math/vectypes.h"
+#include "gromacs/utility/arrayref.h"
#include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/fixedcapacityvector.h"
#include "gromacs/utility/gmxmpi.h"
struct gmx_domdec_t;
public:
/*! \brief Creates GPU Halo Exchange object.
*
- * Coordinate Halo exchange will be performed in \c
- * StreamNonLocal, and the \c communicateHaloCoordinates
- * method must be called before any subsequent operations that
- * access non-local parts of the coordinate buffer (such as
- * the non-local non-bonded kernels). It also must be called
- * after the local coordinates buffer operations (where the
- * coordinates are copied to the device and hence the \c
- * coordinatesReadyOnDeviceEvent is recorded). Force Halo exchange
- * will be performed in \c streamNonLocal and the \c
- * communicateHaloForces method must be called after the
- * non-local buffer operations, after the local force buffer
- * has been copied to the GPU (if CPU forces are present), and
- * before the local buffer operations. The force halo exchange
- * does not yet support virial steps.
+ * Coordinate Halo exchange will be performed in its own stream
+ * with appropriate event-based synchronization, and the \c
+ * communicateHaloCoordinates method must be called before any
+ * subsequent operations that access non-local parts of the
+ * coordinate buffer (such as the non-local non-bonded
+ * kernels). It also must be called after the local coordinates
+ * buffer operations (where the coordinates are copied to the
+ * device and hence the \c coordinatesReadyOnDeviceEvent is
+ * recorded). Force Halo exchange will also be performed in its
+ * own stream with appropriate event-based synchronization, and
+ * the \c communicateHaloForces method must be called after the
+ * non-local buffer operations, after the local force buffer has
+ * been copied to the GPU (if CPU forces are present), and before
+ * the local buffer operations. The force halo exchange does not
+ * yet support virial steps.
*
* \param [inout] dd domdec structure
* \param [in] dimIndex the dimension index for this instance
* \param [in] mpi_comm_mysim communicator used for simulation
* \param [in] deviceContext GPU device context
- * \param [in] streamLocal local NB CUDA stream.
- * \param [in] streamNonLocal non-local NB CUDA stream.
* \param [in] pulse the communication pulse for this instance
* \param [in] wcycle The wallclock counter
*/
int dimIndex,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
- const DeviceStream& streamLocal,
- const DeviceStream& streamNonLocal,
int pulse,
gmx_wallcycle* wcycle);
~GpuHaloExchange();
* Must be called after local setCoordinates (which records an
* event when the coordinate data has been copied to the
* device).
- * \param [in] box Coordinate box (from which shifts will be constructed)
- * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+ * \param [in] box Coordinate box (from which shifts will be constructed)
+ * \param [in] dependencyEvent Dependency event for this operation
+ * \returns Event recorded when this operation has been launched
*/
- void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+ GpuEventSynchronizer* communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* dependencyEvent);
/*! \brief GPU halo exchange of force buffer.
* \param[in] accumulateForces True if forces should accumulate, otherwise they are set
+ * \param[in] dependencyEvents Dependency events for this operation
*/
- void communicateHaloForces(bool accumulateForces);
+ void communicateHaloForces(bool accumulateForces,
+ FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents);
/*! \brief Get the event synchronizer for the forces ready on device.
* \returns The event to synchronize the stream that consumes forces on device.
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
int /* dimIndex */,
MPI_Comm /* mpi_comm_mysim */,
const DeviceContext& /* deviceContext */,
- const DeviceStream& /*streamLocal */,
- const DeviceStream& /*streamNonLocal */,
int /*pulse */,
gmx_wallcycle* /*wcycle*/) :
impl_(nullptr)
}
/*!\brief apply X halo exchange stub. */
-void GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
- GpuEventSynchronizer* /*coordinatesOnDeviceEvent*/)
+GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix /* box */,
+ GpuEventSynchronizer* /*dependencyEvent*/)
{
GMX_ASSERT(!impl_,
"A CPU stub for GPU Halo Exchange exchange was called insted of the correct "
"implementation.");
+ return nullptr;
}
/*!\brief apply F halo exchange stub. */
-void GpuHaloExchange::communicateHaloForces(bool gmx_unused accumulateForces)
+void GpuHaloExchange::communicateHaloForces(bool /* accumulateForces */,
+ FixedCapacityVector<GpuEventSynchronizer*, 2>* /*dependencyEvents*/)
{
GMX_ASSERT(!impl_,
"A CPU stub for GPU Halo Exchange was called insted of the correct implementation.");
std::copy(ind.index.begin(), ind.index.end(), h_indexMap_.begin());
copyToDeviceBuffer(
- &d_indexMap_, h_indexMap_.data(), 0, newSize, nonLocalStream_, GpuApiCallBehavior::Async, nullptr);
+ &d_indexMap_, h_indexMap_.data(), 0, newSize, *haloStream_, GpuApiCallBehavior::Async, nullptr);
}
#if GMX_MPI
0,
mpi_comm_mysim_,
MPI_STATUS_IGNORE);
- remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
+ remoteCoordinatesReadyOnDeviceEvent->enqueueWaitEvent(*haloStream_);
}
-void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box,
- GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+GpuEventSynchronizer* GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix box,
+ GpuEventSynchronizer* dependencyEvent)
{
-
wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu);
- if (pulse_ == 0)
- {
- // ensure stream waits until coordinate data is available on device
- coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
- }
+
+ // ensure stream waits until dependency has been satisfied
+ dependencyEvent->enqueueWaitEvent(*haloStream_);
wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
const auto kernelArgs = prepareGpuKernelArguments(
kernelFn, config, &sendBuf, &d_x, &indexMap, &size, &coordinateShift);
- launchGpuKernel(
- kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
+ launchGpuKernel(kernelFn, config, *haloStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
}
wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
// wait for remote co-ordinates is implicit with process-MPI as non-local stream is synchronized before MPI calls
// and MPI_Waitall call makes sure both neighboring ranks' non-local stream is synchronized before data transfer is initiated
- if (GMX_THREAD_MPI && pulse_ == 0)
+ if (GMX_THREAD_MPI && dimIndex_ == 0 && pulse_ == 0)
{
- enqueueWaitRemoteCoordinatesReadyEvent(coordinatesReadyOnDeviceEvent);
+ enqueueWaitRemoteCoordinatesReadyEvent(dependencyEvent);
}
float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_];
communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);
+ coordinateHaloLaunched_.markEvent(*haloStream_);
+
wallcycle_stop(wcycle_, WallCycleCounter::MoveX);
+
+ return &coordinateHaloLaunched_;
}
// The following method should be called after non-local buffer operations,
-// and before the local buffer operations. It operates in the non-local stream.
-void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
+// and before the local buffer operations.
+void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces,
+ FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents)
{
+
// Consider time spent in communicateHaloData as Comm.F counter
// ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
wallcycle_start(wcycle_, WallCycleCounter::MoveF);
+ while (dependencyEvents->size() > 0)
+ {
+ auto dependency = dependencyEvents->back();
+ dependency->enqueueWaitEvent(*haloStream_);
+ dependencyEvents->pop_back();
+ }
+
float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_;
- // Communicate halo data (in non-local stream)
+ // Communicate halo data
communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_);
wallcycle_stop(wcycle_, WallCycleCounter::MoveF);
wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
float3* d_f = d_f_;
- // If this is the last pulse and index (noting the force halo
- // exchanges across multiple pulses and indices are called in
- // reverse order) then perform the following preparation
- // activities
- if ((pulse_ == (dd_->comm->cd[dimIndex_].numPulses() - 1)) && (dimIndex_ == (dd_->ndim - 1)))
- {
- // ensure non-local stream waits for local stream, due to dependence on
- // the previous H2D copy of CPU forces (if accumulateForces is true)
- // or local force clearing.
- GpuEventSynchronizer eventLocal;
- eventLocal.markEvent(localStream_);
- eventLocal.enqueueWaitEvent(nonLocalStream_);
- }
// Unpack halo buffer into force array
const auto kernelArgs =
prepareGpuKernelArguments(kernelFn, config, &d_f, &recvBuf, &indexMap, &size);
- launchGpuKernel(
- kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
+ launchGpuKernel(kernelFn, config, *haloStream_, nullptr, "Domdec GPU Apply F Halo Exchange", kernelArgs);
}
- if (pulse_ == 0)
- {
- fReadyOnDevice_.markEvent(nonLocalStream_);
- }
+ fReadyOnDevice_.markEvent(*haloStream_);
wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
// no need to wait for haloDataReadyOnDevice event if this rank is not sending any data
if (sendSize > 0)
{
- // wait for non local stream to complete all outstanding
+ // wait for halo stream to complete all outstanding
// activities, to ensure that buffer is up-to-date in GPU memory
// before transferring to remote rank
// ToDo: Replace stream synchronize with event synchronize
- nonLocalStream_.synchronize();
+ haloStream_->synchronize();
}
// perform halo exchange directly in device buffers
sendPtr,
sendSize * DIM * sizeof(float),
cudaMemcpyDeviceToDevice,
- nonLocalStream_.stream());
+ haloStream_->stream());
CU_RET_ERR(stat, "cudaMemcpyAsync on GPU Domdec CUDA direct data transfer failed");
}
GMX_ASSERT(haloDataTransferLaunched_ != nullptr,
"Halo exchange requires valid event to synchronize data transfer initiated in "
"remote rank");
- haloDataTransferLaunched_->markEvent(nonLocalStream_);
+ haloDataTransferLaunched_->markEvent(*haloStream_);
MPI_Sendrecv(&haloDataTransferLaunched_,
sizeof(GpuEventSynchronizer*), //NOLINT(bugprone-sizeof-expression)
mpi_comm_mysim_,
MPI_STATUS_IGNORE);
- haloDataTransferRemote->enqueueWaitEvent(nonLocalStream_);
+ haloDataTransferRemote->enqueueWaitEvent(*haloStream_);
#else
GMX_UNUSED_VALUE(sendRank);
GMX_UNUSED_VALUE(recvRank);
int dimIndex,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
- const DeviceStream& localStream,
- const DeviceStream& nonLocalStream,
int pulse,
gmx_wallcycle* wcycle) :
dd_(dd),
haloDataTransferLaunched_(GMX_THREAD_MPI ? new GpuEventSynchronizer() : nullptr),
mpi_comm_mysim_(mpi_comm_mysim),
deviceContext_(deviceContext),
- localStream_(localStream),
- nonLocalStream_(nonLocalStream),
+ haloStream_(new DeviceStream(deviceContext, DeviceStreamPriority::High, false)),
dimIndex_(dimIndex),
pulse_(pulse),
wcycle_(wcycle)
int dimIndex,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
- const DeviceStream& localStream,
- const DeviceStream& nonLocalStream,
int pulse,
gmx_wallcycle* wcycle) :
- impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, localStream, nonLocalStream, pulse, wcycle))
+ impl_(new Impl(dd, dimIndex, mpi_comm_mysim, deviceContext, pulse, wcycle))
{
}
impl_->reinitHalo(asFloat3(d_coordinatesBuffer), asFloat3(d_forcesBuffer));
}
-void GpuHaloExchange::communicateHaloCoordinates(const matrix box,
- GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
+GpuEventSynchronizer* GpuHaloExchange::communicateHaloCoordinates(const matrix box,
+ GpuEventSynchronizer* dependencyEvent)
{
- impl_->communicateHaloCoordinates(box, coordinatesReadyOnDeviceEvent);
+ return impl_->communicateHaloCoordinates(box, dependencyEvent);
}
-void GpuHaloExchange::communicateHaloForces(bool accumulateForces)
+void GpuHaloExchange::communicateHaloForces(bool accumulateForces,
+ FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents)
{
- impl_->communicateHaloForces(accumulateForces);
+ impl_->communicateHaloForces(accumulateForces, dependencyEvents);
}
GpuEventSynchronizer* GpuHaloExchange::getForcesReadyOnDeviceEvent()
* \param [in] dimIndex the dimension index for this instance
* \param [in] mpi_comm_mysim communicator used for simulation
* \param [in] deviceContext GPU device context
- * \param [in] localStream local NB CUDA stream
- * \param [in] nonLocalStream non-local NB CUDA stream
* \param [in] pulse the communication pulse for this instance
* \param [in] wcycle The wallclock counter
*/
int dimIndex,
MPI_Comm mpi_comm_mysim,
const DeviceContext& deviceContext,
- const DeviceStream& localStream,
- const DeviceStream& nonLocalStream,
int pulse,
gmx_wallcycle* wcycle);
~Impl();
/*! \brief
* GPU halo exchange of coordinates buffer
* \param [in] box Coordinate box (from which shifts will be constructed)
- * \param [in] coordinatesReadyOnDeviceEvent event recorded when coordinates have been copied to device
+ * \param [in] dependencyEvent Dependency event for this operation
+ * \returns Event recorded when this operation has been launched
*/
- void communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
+ GpuEventSynchronizer* communicateHaloCoordinates(const matrix box, GpuEventSynchronizer* dependencyEvent);
/*! \brief GPU halo exchange of force buffer
- * \param[in] accumulateForces True if forces should accumulate, otherwise they are set
+ * \param [in] accumulateForces True if forces should accumulate, otherwise they are set
+ * \param [in] dependencyEvents Dependency events for this operation
*/
- void communicateHaloForces(bool accumulateForces);
+ void communicateHaloForces(bool accumulateForces,
+ FixedCapacityVector<GpuEventSynchronizer*, 2>* dependencyEvents);
/*! \brief Get the event synchronizer for the forces ready on device.
* \returns The event to synchronize the stream that consumes forces on device.
int recvSize,
int recvRank);
- /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in non-local
- * stream \param [in] eventSync event recorded when coordinates/forces are ready to device
+ /*! \brief Exchange coordinate-ready event with neighbor ranks and enqueue wait in halo stream
+ * \param [in] eventSync event recorded when coordinates/forces are ready to device
*/
void enqueueWaitRemoteCoordinatesReadyEvent(GpuEventSynchronizer* coordinatesReadyOnDeviceEvent);
MPI_Comm mpi_comm_mysim_;
//! GPU context object
const DeviceContext& deviceContext_;
- //! CUDA stream for local non-bonded calculations
- const DeviceStream& localStream_;
- //! CUDA stream for non-local non-bonded calculations
- const DeviceStream& nonLocalStream_;
+ //! CUDA stream for this halo exchange
+ DeviceStream* haloStream_;
//! full coordinates buffer in GPU memory
float3* d_x_ = nullptr;
//! full forces buffer in GPU memory
gmx_wallcycle* wcycle_ = nullptr;
//! The atom offset for receive (x) or send (f) for dimension index and pulse corresponding to this halo exchange instance
int atomOffset_ = 0;
+ //! Event triggered when coordinate halo has been launched
+ GpuEventSynchronizer coordinateHaloLaunched_;
};
} // namespace gmx
{
for (int pulse = 0; pulse < dd->comm->cd[d].numPulses(); pulse++)
{
- gpuHaloExchange[d].push_back(GpuHaloExchange(
- dd, d, MPI_COMM_WORLD, deviceContext, deviceStream, deviceStream, pulse, nullptr));
+ gpuHaloExchange[d].push_back(
+ GpuHaloExchange(dd, d, MPI_COMM_WORLD, deviceContext, pulse, nullptr));
}
}
const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
|| runScheduleWork->simulationWork.havePpDomainDecomposition;
const int atomStart = 0;
- fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
- nbv->getNumAtoms(AtomLocality::Local),
- nbv->getGridIndices(),
- atomStart,
- accumulate,
- stateGpu->fReducedOnDevice());
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
+ stateGpu->getForces(),
+ nbv->getNumAtoms(AtomLocality::Local),
+ nbv->getGridIndices(),
+ atomStart,
+ accumulate,
+ stateGpu->fReducedOnDevice(AtomLocality::Local));
// register forces and add dependencies
fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
}
}
- if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
+ if (runScheduleWork->domainWork.haveCpuLocalForceWork
+ || (runScheduleWork->simulationWork.havePpDomainDecomposition
+ && !runScheduleWork->simulationWork.useGpuHaloExchange))
{
- // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
- if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
- {
- const bool useGpuForceBufferOps = true;
- fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
- stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::All, useGpuForceBufferOps));
- }
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
+ stateGpu->fReadyOnDevice(AtomLocality::Local));
}
if (runScheduleWork->simulationWork.useGpuHaloExchange)
const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
|| runScheduleWork->domainWork.haveFreeEnergyWork;
const int atomStart = dd_numHomeAtoms(*cr->dd);
- fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(stateGpu->getForces(),
- nbv->getNumAtoms(AtomLocality::NonLocal),
- nbv->getGridIndices(),
- atomStart,
- accumulate);
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
+ stateGpu->getForces(),
+ nbv->getNumAtoms(AtomLocality::NonLocal),
+ nbv->getGridIndices(),
+ atomStart,
+ accumulate,
+ stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
// register forces and add dependencies
- // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(
Nbnxm::gpu_get_f(nbv->gpu_nbv));
+
+ if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
+ stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
+ }
}
}
runScheduleWork->stepWork = setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, simulationWork);
const StepWorkload& stepWork = runScheduleWork->stepWork;
+ if (stepWork.useGpuFHalo && !runScheduleWork->domainWork.haveCpuLocalForceWork)
+ {
+ // GPU Force halo exchange will set a subset of local atoms with remote non-local data
+ // First clear local portion of force array, so that untouched atoms are zero.
+ // The dependency for this is that forces from previous timestep have been consumed,
+ // which is satisfied when getCoordinatesReadyOnDeviceEvent has been marked.
+ stateGpu->clearForcesOnGpu(AtomLocality::Local,
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
+ AtomLocality::Local, simulationWork, stepWork));
+ }
+
/* At a search step we need to start the first balancing region
* somewhere early inside the step after communication during domain
* decomposition (and not during the previous step as usual).
}
else
{
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
if (stepWork.useGpuXHalo)
{
// The following must be called after local setCoordinates (which records an event
// when the coordinate data has been copied to the device).
- communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
+ gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
{
// non-local part of coordinate buffer must be copied back to host for CPU work
- stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
+ stateGpu->copyCoordinatesFromGpu(
+ x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
}
}
else
{
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
}
- nbv->convertCoordinatesGpu(AtomLocality::NonLocal,
- stateGpu->getCoordinates(),
- stateGpu->getCoordinatesReadyOnDeviceEvent(
- AtomLocality::NonLocal, simulationWork, stepWork));
+ nbv->convertCoordinatesGpu(
+ AtomLocality::NonLocal,
+ stateGpu->getCoordinates(),
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
+ AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched));
}
else
{
{
// If there exist CPU forces, data from halo exchange should accumulate into these
bool accumulateForces = domainWork.haveCpuLocalForceWork;
- if (!accumulateForces)
- {
- // Force halo exchange will set a subset of local atoms with remote non-local data
- // First clear local portion of force array, so that untouched atoms are zero
- stateGpu->clearForcesOnGpu(AtomLocality::Local);
- }
- communicateGpuHaloForces(*cr, accumulateForces);
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
+ gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
+ gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+
+ communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
}
else
{
// These should be unified.
if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
{
- // Note: AtomLocality::All is used for the non-DD case because, as in this
- // case copyForcesToGpu() uses a separate stream, it allows overlap of
- // CPU force H2D with GPU force tasks on all streams including those in the
- // local stream which would otherwise be implicit dependencies for the
- // transfer and would not overlap.
- auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
- : AtomLocality::All;
-
- stateGpu->copyForcesToGpu(forceWithShift, locality);
+ stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
}
if (stepWork.computeNonbondedForces)
&& do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
// This applies Leap-Frog, LINCS and SETTLE in succession
- integrator->integrate(
- stateGpu->getForcesReadyOnDeviceEvent(
- AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
- ir->delta_t,
- true,
- bCalcVir,
- shake_vir,
- doTemperatureScaling,
- ekind->tcstat,
- doParrinelloRahman,
- ir->nstpcouple * ir->delta_t,
- M);
+ integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
+ runScheduleWork->stepWork, runScheduleWork->simulationWork),
+ ir->delta_t,
+ true,
+ bCalcVir,
+ shake_vir,
+ doTemperatureScaling,
+ ekind->tcstat,
+ doParrinelloRahman,
+ ir->nstpcouple * ir->delta_t,
+ M);
// Copy velocities D2H after update if:
// - Globals are computed this step (includes the energy output steps).
* steps and if update is not offloaded, the coordinates are provided by the H2D copy and the
* returned synchronizer indicates that the copy is complete.
*
- * \param[in] atomLocality Locality of the particles to wait for.
- * \param[in] simulationWork The simulation lifetime flags.
- * \param[in] stepWork The step lifetime flags.
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \param[in] simulationWork The simulation lifetime flags.
+ * \param[in] stepWork The step lifetime flags.
+ * \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched.
*
* \returns The event to synchronize the stream that consumes coordinates on device.
*/
GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
const SimulationWorkload& simulationWork,
- const StepWorkload& stepWork);
+ const StepWorkload& stepWork,
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr);
/*! \brief Blocking wait until coordinates are copied to the device.
*
*/
void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent);
- /*! \brief Copy positions from the GPU memory.
+ /*! \brief Copy positions from the GPU memory, with an optional explicit dependency.
*
* \param[in] h_x Positions buffer in the host memory.
* \param[in] atomLocality Locality of the particles to copy.
+ * \param[in] dependency Dependency event for this operation.
*/
- void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality);
+ void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+ AtomLocality atomLocality,
+ GpuEventSynchronizer* dependency = nullptr);
/*! \brief Wait until coordinates are available on the host.
*
/*! \brief Clear forces in the GPU memory.
*
* \param[in] atomLocality Locality of the particles to clear.
+ * \param[in] dependency Dependency event for this operation.
*/
- void clearForcesOnGpu(AtomLocality atomLocality);
+ void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency);
/*! \brief Get the event synchronizer for the forces ready on device.
*
* 1. The forces are copied to the device (when GPU buffer ops are off)
* 2. The forces are reduced on the device (GPU buffer ops are on)
*
- * \todo Pass step workload instead of the useGpuFBufferOps boolean.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- * \param[in] useGpuFBufferOps If the force buffer ops are offloaded to the GPU.
+ * \param[in] stepWork Step workload flags
+ * \param[in] simulationWork Simulation workload flags
*
* \returns The event to synchronize the stream that consumes forces on device.
*/
- GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps);
+ GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+ SimulationWorkload simulationWork);
/*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
*
- * \returns The event to mark when forces are reduced on the GPU.
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \returns The event to mark when forces are reduced on the GPU.
+ */
+ GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+
+ /*! \brief Getter for the event synchronizer for the forces are ready on the GPU.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \returns The event to mark when forces are ready on the GPU.
*/
- GpuEventSynchronizer* fReducedOnDevice();
+ GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality);
/*! \brief Copy forces from the GPU memory.
*
GpuEventSynchronizer* StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(
AtomLocality /* atomLocality */,
const SimulationWorkload& /* simulationWork */,
- const StepWorkload& /* stepWork */)
+ const StepWorkload& /* stepWork */,
+ GpuEventSynchronizer* /* gpuCoordinateHaloLaunched */)
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
}
void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> /* h_x */,
- AtomLocality /* atomLocality */)
+ AtomLocality /* atomLocality */,
+ GpuEventSynchronizer* /* dependency */)
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
"GPU implementation.");
}
-
DeviceBuffer<RVec> StatePropagatorDataGpu::getVelocities()
{
GMX_ASSERT(!impl_,
"GPU implementation.");
}
-void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality /* atomLocality */)
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality /* atomLocality */,
+ GpuEventSynchronizer* /* dependency */)
+{
+ GMX_ASSERT(!impl_,
+ "A CPU stub method from GPU state propagator data was called instead of one from "
+ "GPU implementation.");
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::getLocalForcesReadyOnDeviceEvent(StepWorkload /* stepWork */,
+ SimulationWorkload /* simulationWork */)
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
"GPU implementation.");
+ return nullptr;
}
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality /* atomLocality */,
- bool /* useGpuFBufferOps */)
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality /* atomLocality */)
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
return nullptr;
}
-GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
+GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality /* atomLocality */)
{
GMX_ASSERT(!impl_,
"A CPU stub method from GPU state propagator data was called instead of one from "
* \param[in] atomLocality Locality of the particles to wait for.
* \param[in] simulationWork The simulation lifetime flags.
* \param[in] stepWork The step lifetime flags.
+ * \param[in] gpuCoordinateHaloLaunched Event recorded when GPU coordinate halo has been launched.
*
* \returns The event to synchronize the stream that consumes coordinates on device.
*/
GpuEventSynchronizer* getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
const SimulationWorkload& simulationWork,
- const StepWorkload& stepWork);
+ const StepWorkload& stepWork,
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr);
/*! \brief Blocking wait until coordinates are copied to the device.
*
*/
void setXUpdatedOnDeviceEvent(GpuEventSynchronizer* xUpdatedOnDeviceEvent);
- /*! \brief Copy positions from the GPU memory.
+ /*! \brief Copy positions from the GPU memory, with an optional explicit dependency.
*
* \param[in] h_x Positions buffer in the host memory.
* \param[in] atomLocality Locality of the particles to copy.
+ * \param[in] dependency Dependency event for this operation.
*/
- void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality);
+ void copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+ AtomLocality atomLocality,
+ GpuEventSynchronizer* dependency = nullptr);
/*! \brief Wait until coordinates are available on the host.
*
/*! \brief Clear forces in the GPU memory.
*
* \param[in] atomLocality Locality of the particles to clear.
+ * \param[in] dependency Dependency event for this operation.
*/
- void clearForcesOnGpu(AtomLocality atomLocality);
+ void clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency);
/*! \brief Get the event synchronizer for the forces ready on device.
*
* 1. The forces are copied to the device (when GPU buffer ops are off)
* 2. The forces are reduced on the device (GPU buffer ops are on)
*
- * \todo Pass step workload instead of the useGpuFBufferOps boolean.
- *
- * \param[in] atomLocality Locality of the particles to wait for.
- * \param[in] useGpuFBufferOps If the force buffer ops are offloaded to the GPU.
+ * \param[in] stepWork Step workload flags.
+ * \param[in] simulationWork Simulation workload flags.
*
* \returns The event to synchronize the stream that consumes forces on device.
*/
- GpuEventSynchronizer* getForcesReadyOnDeviceEvent(AtomLocality atomLocality, bool useGpuFBufferOps);
+ GpuEventSynchronizer* getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+ SimulationWorkload simulationWork);
- /*! \brief Getter for the event synchronizer for the forces are reduced on the GPU.
+ /*! \brief Getter for the event synchronizer for when forces are reduced on the GPU.
*
- * \returns The event to mark when forces are reduced on the GPU.
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \returns The event to mark when forces are reduced on the GPU.
+ */
+ GpuEventSynchronizer* fReducedOnDevice(AtomLocality atomLocality);
+
+ /*! \brief Getter for the event synchronizer for when forces are ready for GPU update.
+ *
+ * \param[in] atomLocality Locality of the particles to wait for.
+ * \returns The event to mark when forces are ready for GPU update.
*/
- GpuEventSynchronizer* fReducedOnDevice();
+ GpuEventSynchronizer* fReadyOnDevice(AtomLocality atomLocality);
/*! \brief Copy forces from the GPU memory.
*
EnumerationArray<AtomLocality, const DeviceStream*> vCopyStreams_ = { { nullptr } };
// Streams to use for forces H2D and D2H copies (one event for each atom locality)
EnumerationArray<AtomLocality, const DeviceStream*> fCopyStreams_ = { { nullptr } };
+ // Streams internal to this module
+ std::unique_ptr<DeviceStream> copyInStream_;
+ std::unique_ptr<DeviceStream> memsetStream_;
/*! \brief An array of events that indicate H2D copy is complete (one event for each atom locality)
*
//! An array of events that indicate H2D copy of forces is complete (one event for each atom locality)
EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnDevice_;
- //! An event that the forces were reduced on the GPU
- GpuEventSynchronizer fReducedOnDevice_;
+ //! An array of events that indicate the forces were reduced on the GPU (one event for each atom locality)
+ EnumerationArray<AtomLocality, GpuEventSynchronizer> fReducedOnDevice_;
//! An array of events that indicate D2H copy of forces is complete (one event for each atom locality)
EnumerationArray<AtomLocality, GpuEventSynchronizer> fReadyOnHost_;
fCopyStreams_[AtomLocality::Local] = localStream_;
fCopyStreams_[AtomLocality::NonLocal] = nonLocalStream_;
fCopyStreams_[AtomLocality::All] = updateStream_;
+
+ copyInStream_ = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::Normal, false);
+ memsetStream_ = std::make_unique<DeviceStream>(deviceContext_, DeviceStreamPriority::Normal, false);
}
StatePropagatorDataGpu::Impl::Impl(const DeviceStream* pmeStream,
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
-GpuEventSynchronizer*
-StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
- const SimulationWorkload& simulationWork,
- const StepWorkload& stepWork)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(
+ AtomLocality atomLocality,
+ const SimulationWorkload& simulationWork,
+ const StepWorkload& stepWork,
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched)
{
// The provider of the coordinates may be different for local atoms. If the update is offloaded
// and this is not a neighbor search step, then the consumer needs to wait for the update
// to complete. Otherwise, the coordinates are copied from the host and we need to wait for
- // the copy event. Non-local coordinates are always provided by the H2D copy.
- //
- // TODO: This should be reconsidered to support the halo exchange.
+ // the copy event. Non-local coordinates are provided by the GPU halo exchange (if active), otherwise by H2D copy.
//
// In OpenCL no events are used as coordinate sync is not necessary
if (GMX_GPU_OPENCL)
{
return nullptr;
}
+ if (atomLocality == AtomLocality::NonLocal && stepWork.useGpuXHalo)
+ {
+ GMX_ASSERT(gpuCoordinateHaloLaunched != nullptr,
+ "GPU halo exchange is active but its completion event is null.");
+ return gpuCoordinateHaloLaunched;
+ }
if (atomLocality == AtomLocality::Local && simulationWork.useGpuUpdate && !stepWork.doNeighborSearch)
{
GMX_ASSERT(xUpdatedOnDeviceEvent_ != nullptr, "The event synchronizer can not be nullptr.");
xUpdatedOnDeviceEvent_ = xUpdatedOnDeviceEvent;
}
-void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x, AtomLocality atomLocality)
+void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVec> h_x,
+ AtomLocality atomLocality,
+ GpuEventSynchronizer* dependency)
{
GMX_ASSERT(atomLocality < AtomLocality::All,
formatString("Wrong atom locality. Only Local and NonLocal are allowed for "
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying positions with given atom locality.");
+ if (dependency != nullptr)
+ {
+ dependency->enqueueWaitEvent(*deviceStream);
+ }
+
wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
return d_f_;
}
+// Copy CPU forces to GPU using stream internal to this module to allow overlap
+// with GPU force calculations.
void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx::RVec> h_f,
AtomLocality atomLocality)
{
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+ DeviceStream* deviceStream = copyInStream_.get();
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for copying forces with given atom locality.");
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
-void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
+void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency)
{
GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
- const DeviceStream* deviceStream = fCopyStreams_[atomLocality];
+ DeviceStream* deviceStream = memsetStream_.get();
+
+ GMX_ASSERT(dependency != nullptr, "Dependency is not valid for clearing forces.");
+ dependency->enqueueWaitEvent(*deviceStream);
+
GMX_ASSERT(deviceStream != nullptr,
"No stream is valid for clearing forces with given atom locality.");
clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
+ fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
+
wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
}
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
- bool useGpuFBufferOps)
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+ SimulationWorkload simulationWork)
{
- if ((atomLocality == AtomLocality::Local || atomLocality == AtomLocality::NonLocal) && useGpuFBufferOps)
+ if (stepWork.useGpuFBufferOps && !simulationWork.useCpuPmePpCommunication)
{
- return &fReducedOnDevice_;
+ return &fReducedOnDevice_[AtomLocality::Local];
}
else
{
- return &fReadyOnDevice_[atomLocality];
+ return &fReadyOnDevice_[AtomLocality::Local];
}
}
-GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice()
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReducedOnDevice(AtomLocality atomLocality)
+{
+ return &fReducedOnDevice_[atomLocality];
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::Impl::fReadyOnDevice(AtomLocality atomLocality)
{
- return &fReducedOnDevice_;
+ return &fReadyOnDevice_[atomLocality];
}
void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_f, AtomLocality atomLocality)
GpuEventSynchronizer*
StatePropagatorDataGpu::getCoordinatesReadyOnDeviceEvent(AtomLocality atomLocality,
const SimulationWorkload& simulationWork,
- const StepWorkload& stepWork)
+ const StepWorkload& stepWork,
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched)
{
- return impl_->getCoordinatesReadyOnDeviceEvent(atomLocality, simulationWork, stepWork);
+ return impl_->getCoordinatesReadyOnDeviceEvent(
+ atomLocality, simulationWork, stepWork, gpuCoordinateHaloLaunched);
}
void StatePropagatorDataGpu::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
impl_->setXUpdatedOnDeviceEvent(xUpdatedOnDeviceEvent);
}
-void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x, AtomLocality atomLocality)
+void StatePropagatorDataGpu::copyCoordinatesFromGpu(gmx::ArrayRef<RVec> h_x,
+ AtomLocality atomLocality,
+ GpuEventSynchronizer* dependency)
{
- return impl_->copyCoordinatesFromGpu(h_x, atomLocality);
+ return impl_->copyCoordinatesFromGpu(h_x, atomLocality, dependency);
}
void StatePropagatorDataGpu::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
return impl_->copyForcesToGpu(h_f, atomLocality);
}
-void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality)
+void StatePropagatorDataGpu::clearForcesOnGpu(AtomLocality atomLocality, GpuEventSynchronizer* dependency)
+{
+ return impl_->clearForcesOnGpu(atomLocality, dependency);
+}
+
+GpuEventSynchronizer* StatePropagatorDataGpu::getLocalForcesReadyOnDeviceEvent(StepWorkload stepWork,
+ SimulationWorkload simulationWork)
{
- return impl_->clearForcesOnGpu(atomLocality);
+ return impl_->getLocalForcesReadyOnDeviceEvent(stepWork, simulationWork);
}
-GpuEventSynchronizer* StatePropagatorDataGpu::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
- bool useGpuFBufferOps)
+GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice(AtomLocality atomLocality)
{
- return impl_->getForcesReadyOnDeviceEvent(atomLocality, useGpuFBufferOps);
+ return impl_->fReducedOnDevice(atomLocality);
}
-GpuEventSynchronizer* StatePropagatorDataGpu::fReducedOnDevice()
+GpuEventSynchronizer* StatePropagatorDataGpu::fReadyOnDevice(AtomLocality atomLocality)
{
- return impl_->fReducedOnDevice();
+ return impl_->fReadyOnDevice(atomLocality);
}
void StatePropagatorDataGpu::copyForcesFromGpu(gmx::ArrayRef<RVec> h_f, AtomLocality atomLocality)