#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "gromacs/ewald/pme_pp.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
*
* \param[in] step The step number, used for checking and printing
* \param[in] enerd The energy data; the non-bonded group energies need to be added to
- * enerd.term[F_EPOT] before calling this routine \param[in] inputrec The input record
+ * \c enerd.term[F_EPOT] before calling this routine
+ * \param[in] inputrec The input record
*/
static void checkPotentialEnergyValidity(int64_t step, const gmx_enerdata_t& enerd, const t_inputrec& inputrec)
{
gmx_wallcycle* wcycle)
{
pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
- pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
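+    // Direct GPU PME-PP communication is not used on this path, so the new
+    // pme_gpu_launch_spread arguments are passed as disabled defaults.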
+ bool useGpuDirectComm = false;
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+ pme_gpu_launch_spread(
+ pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
}
/*! \brief Launch the FFT and gather stages of PME GPU
GMX_ASSERT(simulationWork.useGpuNonbonded,
"Can only offload buffer ops if nonbonded computation is also offloaded");
}
- flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
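+    // X buffer ops are not used on search steps, when the device buffers may be reinitialized.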
+ flags.useGpuXBufferOps = simulationWork.useGpuBufferOps && !flags.doNeighborSearch;
// on virial steps the CPU reduction path is taken
flags.useGpuFBufferOps = simulationWork.useGpuBufferOps && !flags.computeVirial;
const bool rankHasGpuPmeTask = simulationWork.useGpuPme && !simulationWork.haveSeparatePmeRank;
flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps
&& (rankHasGpuPmeTask || simulationWork.useGpuPmePpCommunication);
- flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
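+    // GPU coordinate halo exchange is likewise skipped on search steps.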
+ flags.useGpuXHalo = simulationWork.useGpuHaloExchange && !flags.doNeighborSearch;
flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
flags.haveGpuPmeOnThisRank = rankHasGpuPmeTask && flags.computeSlowForces;
flags.combineMtsForcesBeforeHaloExchange =
}
}
-/*! \brief Setup for the local and non-local GPU force reductions:
+/*! \brief Setup for the local GPU force reduction:
* reinitialization plus the registration of forces and dependencies.
*
- * \param [in] runScheduleWork Schedule workload flag structure
- * \param [in] cr Communication record object
- * \param [in] fr Force record object
+ * \param[in] runScheduleWork   Schedule workload flag structure
+ * \param[in] nbv               Non-bonded Verlet object
+ * \param[in] stateGpu          GPU state propagator object
+ * \param[in] gpuForceReduction GPU force reduction object
+ * \param[in] pmePpCommGpu      PME-PP GPU communication object
+ * \param[in] pmedata           PME data object
+ * \param[in] dd                Domain decomposition object
*/
-static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
- const t_commrec* cr,
- t_forcerec* fr)
+static void setupLocalGpuForceReduction(const gmx::MdrunScheduleWorkload* runScheduleWork,
+ const nonbonded_verlet_t* nbv,
+ gmx::StatePropagatorDataGpu* stateGpu,
+ gmx::GpuForceReduction* gpuForceReduction,
+ gmx::PmePpCommGpu* pmePpCommGpu,
+ const gmx_pme_t* pmedata,
+ const gmx_domdec_t* dd)
{
-
- nonbonded_verlet_t* nbv = fr->nbv.get();
- gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
+ GMX_ASSERT(!runScheduleWork->simulationWork.useMts,
+ "GPU force reduction is not compatible with MTS");
// (re-)initialize local GPU force reduction
const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
|| runScheduleWork->simulationWork.havePpDomainDecomposition;
const int atomStart = 0;
- fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
- nbv->getNumAtoms(AtomLocality::Local),
- nbv->getGridIndices(),
- atomStart,
- accumulate,
- stateGpu->fReducedOnDevice());
+ gpuForceReduction->reinit(stateGpu->getForces(),
+ nbv->getNumAtoms(AtomLocality::Local),
+ nbv->getGridIndices(),
+ atomStart,
+ accumulate,
+ stateGpu->fReducedOnDevice(AtomLocality::Local));
// register forces and add dependencies
- fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
-
- if (runScheduleWork->simulationWork.useGpuPme
- && (!runScheduleWork->simulationWork.haveSeparatePmeRank
- || runScheduleWork->simulationWork.useGpuPmePpCommunication))
- {
- DeviceBuffer<gmx::RVec> forcePtr =
- runScheduleWork->simulationWork.haveSeparatePmeRank
- ? fr->pmePpCommGpu->getGpuForceStagingPtr() // buffer received from other GPU
- : pme_gpu_get_device_f(fr->pmedata); // PME force buffer on same GPU
- fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
-
- GpuEventSynchronizer* const pmeSynchronizer =
- (runScheduleWork->simulationWork.haveSeparatePmeRank
- ? fr->pmePpCommGpu->getForcesReadySynchronizer() // buffer received from other GPU
- : pme_gpu_get_f_ready_synchronizer(fr->pmedata)); // PME force buffer on same GPU
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
+
+ DeviceBuffer<gmx::RVec> pmeForcePtr;
+ GpuEventSynchronizer* pmeSynchronizer = nullptr;
+ bool havePmeContribution = false;
+
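+    // The PME force contribution comes either from the PME task on this rank or,
+    // with a separate PME rank, from a staging buffer filled by direct GPU communication.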
+ if (runScheduleWork->simulationWork.useGpuPme && !runScheduleWork->simulationWork.haveSeparatePmeRank)
+ {
+ pmeForcePtr = pme_gpu_get_device_f(pmedata);
+ pmeSynchronizer = pme_gpu_get_f_ready_synchronizer(pmedata);
+ havePmeContribution = true;
+ }
+ else if (runScheduleWork->simulationWork.useGpuPmePpCommunication)
+ {
+ pmeForcePtr = pmePpCommGpu->getGpuForceStagingPtr();
if (GMX_THREAD_MPI)
{
- GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
- fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
+ pmeSynchronizer = pmePpCommGpu->getForcesReadySynchronizer();
}
+ havePmeContribution = true;
}
- if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
+ if (havePmeContribution)
{
- // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
- if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
+ gpuForceReduction->registerRvecForce(pmeForcePtr);
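+        // Add the forces-ready event as a dependency where it is available: always when
+        // PME runs on this rank, and with thread-MPI when the forces arrive via direct
+        // GPU communication.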
+ if (!runScheduleWork->simulationWork.useGpuPmePpCommunication || GMX_THREAD_MPI)
{
- const bool useGpuForceBufferOps = true;
- fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
- stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::All, useGpuForceBufferOps));
+            GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready CUDA event should not be null");
+ gpuForceReduction->addDependency(pmeSynchronizer);
}
}
- if (runScheduleWork->simulationWork.useGpuHaloExchange)
+ if (runScheduleWork->domainWork.haveCpuLocalForceWork
+ || (runScheduleWork->simulationWork.havePpDomainDecomposition
+ && !runScheduleWork->simulationWork.useGpuHaloExchange))
{
- fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
- cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::Local));
}
- if (runScheduleWork->simulationWork.havePpDomainDecomposition)
+ if (runScheduleWork->simulationWork.useGpuHaloExchange)
{
- // (re-)initialize non-local GPU force reduction
- const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
- || runScheduleWork->domainWork.haveFreeEnergyWork;
- const int atomStart = dd_numHomeAtoms(*cr->dd);
- fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(stateGpu->getForces(),
- nbv->getNumAtoms(AtomLocality::NonLocal),
- nbv->getGridIndices(),
- atomStart,
- accumulate);
+ gpuForceReduction->addDependency(dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
+ }
+}
- // register forces and add dependencies
- // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
- fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(
- Nbnxm::gpu_get_f(nbv->gpu_nbv));
+/*! \brief Setup for the non-local GPU force reduction:
+ * reinitialization plus the registration of forces and dependencies.
+ *
+ * \param[in] runScheduleWork   Schedule workload flag structure
+ * \param[in] nbv               Non-bonded Verlet object
+ * \param[in] stateGpu          GPU state propagator object
+ * \param[in] gpuForceReduction GPU force reduction object
+ * \param[in] dd                Domain decomposition object
+ */
+static void setupNonLocalGpuForceReduction(const gmx::MdrunScheduleWorkload* runScheduleWork,
+ const nonbonded_verlet_t* nbv,
+ gmx::StatePropagatorDataGpu* stateGpu,
+ gmx::GpuForceReduction* gpuForceReduction,
+ const gmx_domdec_t* dd)
+{
+ // (re-)initialize non-local GPU force reduction
+ const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
+ || runScheduleWork->domainWork.haveFreeEnergyWork;
+ const int atomStart = dd_numHomeAtoms(*dd);
+ gpuForceReduction->reinit(stateGpu->getForces(),
+ nbv->getNumAtoms(AtomLocality::NonLocal),
+ nbv->getGridIndices(),
+ atomStart,
+ accumulate,
+ stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+
+ // register forces and add dependencies
+ gpuForceReduction->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
+
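+    // If the CPU buffer holds non-local force contributions, depend on them being ready on the device.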
+ if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
+ {
+ gpuForceReduction->addDependency(stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
}
}
rvec muTotal,
double t,
gmx_edsam* ed,
+ CpuPpLongRangeNonbondeds* longRangeNonbondeds,
int legacyFlags,
const DDBalanceRegionHandler& ddBalanceRegionHandler)
{
runScheduleWork->stepWork = setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, simulationWork);
const StepWorkload& stepWork = runScheduleWork->stepWork;
+ if (stepWork.useGpuFHalo && !runScheduleWork->domainWork.haveCpuLocalForceWork)
+ {
+        // The GPU force halo exchange will set a subset of local atoms with remote non-local data.
+        // First clear the local portion of the force array, so that untouched atoms are zero.
+        // The dependency for this is that the forces from the previous time step have been
+        // consumed, which is satisfied once getCoordinatesReadyOnDeviceEvent has been marked.
+ stateGpu->clearForcesOnGpu(AtomLocality::Local,
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
+ AtomLocality::Local, simulationWork, stepWork));
+ }
+
/* At a search step we need to start the first balancing region
* somewhere early inside the step after communication during domain
* decomposition (and not during the previous step as usual).
}
const bool fillGrid = (stepWork.doNeighborSearch && stepWork.stateChanged);
- const bool calcCGCM = (fillGrid && !DOMAINDECOMP(cr));
+ const bool calcCGCM = (fillGrid && !haveDDAtomOrdering(*cr));
if (calcCGCM)
{
put_atoms_in_box_omp(fr->pbcType,
haveCopiedXFromGpu = true;
}
+    if (stepWork.doNeighborSearch && (stepWork.haveGpuPmeOnThisRank || simulationWork.useGpuBufferOps))
+ {
+ // TODO refactor this to do_md, after partitioning.
+ stateGpu->reinit(mdatoms->homenr,
+ getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
+ if (stepWork.haveGpuPmeOnThisRank)
+ {
+ // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
+ pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
+ }
+ }
+
// Coordinates on the device are needed if PME or BufferOps are offloaded.
// The local coordinates can be copied right away.
// NOTE: Consider moving this copy to right after they are updated and constrained,
// if the later is not offloaded.
if (stepWork.haveGpuPmeOnThisRank || stepWork.useGpuXBufferOps)
{
- if (stepWork.doNeighborSearch)
- {
- // TODO refactor this to do_md, after partitioning.
- stateGpu->reinit(mdatoms->homenr,
- getLocalAtomCount(cr->dd, *mdatoms, simulationWork.havePpDomainDecomposition));
- if (stepWork.haveGpuPmeOnThisRank)
- {
- // TODO: This should be moved into PME setup function ( pme_gpu_prepare_computation(...) )
- pme_gpu_set_device_x(fr->pmedata, stateGpu->getCoordinates());
- }
- }
// We need to copy coordinates when:
// 1. Update is not offloaded
// 2. The buffers were reinitialized on search step
{
GMX_ASSERT(stateGpu != nullptr, "stateGpu should not be null");
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+ if (stepWork.doNeighborSearch)
+ {
+            /* On NS steps, we skip X buffer ops. So, unless we use PME or direct GPU
+             * communication, we don't wait for the coordinates on the device,
+             * and we must consume the event here.
+             * Issue #3988. */
+ const bool eventWillBeConsumedByGpuPme = stepWork.haveGpuPmeOnThisRank;
+ const bool eventWillBeConsumedByGpuPmePPComm =
+ (simulationWork.haveSeparatePmeRank && stepWork.computeSlowForces)
+ && pmeSendCoordinatesFromGpu;
+ if (!eventWillBeConsumedByGpuPme && !eventWillBeConsumedByGpuPmePPComm)
+ {
+ stateGpu->consumeCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
+ }
+ }
}
}
}
wallcycle_start(wcycle, WallCycleCounter::NS);
- if (!DOMAINDECOMP(cr))
+ if (!haveDDAtomOrdering(*cr))
{
const rvec vzero = { 0.0_real, 0.0_real, 0.0_real };
const rvec boxDiagonal = { box[XX][XX], box[YY][YY], box[ZZ][ZZ] };
wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
wallcycle_stop(wcycle, WallCycleCounter::NS);
- if (stepWork.useGpuXBufferOps)
+ if (simulationWork.useGpuBufferOps)
{
nbv->atomdata_init_copy_x_to_nbat_x_gpu();
}
if (simulationWork.useGpuBufferOps)
{
- setupGpuForceReductions(runScheduleWork, cr, fr);
+ setupLocalGpuForceReduction(runScheduleWork,
+ fr->nbv.get(),
+ stateGpu,
+ fr->gpuForceReduction[gmx::AtomLocality::Local].get(),
+ fr->pmePpCommGpu.get(),
+ fr->pmedata,
+ cr->dd);
+ if (runScheduleWork->simulationWork.havePpDomainDecomposition)
+ {
+ setupNonLocalGpuForceReduction(runScheduleWork,
+ fr->nbv.get(),
+ stateGpu,
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal].get(),
+ cr->dd);
+ }
}
}
else if (!EI_TPI(inputrec.eI) && stepWork.computeNonbondedForces)
wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
- if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
+ if (!stepWork.useGpuXBufferOps)
{
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
}
}
else
{
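+        // Event marking that the GPU coordinate halo exchange has been launched; used below
+        // to order the non-local D2H copy and the coordinate buffer-op conversion after it.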
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
if (stepWork.useGpuXHalo)
{
// The following must be called after local setCoordinates (which records an event
// when the coordinate data has been copied to the device).
- communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
+ gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
{
// non-local part of coordinate buffer must be copied back to host for CPU work
- stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
+ stateGpu->copyCoordinatesFromGpu(
+ x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
}
}
else
if (stepWork.useGpuXBufferOps)
{
- if (!stepWork.haveGpuPmeOnThisRank && !stepWork.useGpuXHalo)
+ if (!stepWork.useGpuXHalo)
{
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
}
- nbv->convertCoordinatesGpu(AtomLocality::NonLocal,
- stateGpu->getCoordinates(),
- stateGpu->getCoordinatesReadyOnDeviceEvent(
- AtomLocality::NonLocal, simulationWork, stepWork));
+ nbv->convertCoordinatesGpu(
+ AtomLocality::NonLocal,
+ stateGpu->getCoordinates(),
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
+ AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched));
}
else
{
if (simulationWork.useGpuNonbonded)
{
- if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
+ if (!stepWork.useGpuXBufferOps)
{
wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
}
}
+ // With FEP we set up the reduction over threads for local+non-local simultaneously,
+ // so we need to do that here after the local and non-local pairlist construction.
+ if (stepWork.doNeighborSearch && fr->efep != FreeEnergyPerturbationType::No)
+ {
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFep);
+ nbv->setupFepThreadedForceBuffer(fr->natoms_force_constr);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFep);
+ }
+
if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
{
/* launch D2H copy-back F */
/* Reset energies */
reset_enerdata(enerd);
- if (DOMAINDECOMP(cr) && simulationWork.haveSeparatePmeRank)
+ if (haveDDAtomOrdering(*cr) && simulationWork.haveSeparatePmeRank)
{
wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
dd_force_flop_start(cr->dd, nrnb);
/* Calculate the local and non-local free energy interactions here.
* Happens here on the CPU both with and without GPU.
*/
- nbv->dispatchFreeEnergyKernel(
- InteractionLocality::Local,
- x.unpaddedArrayRef(),
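+        // The new interface dispatches the local and non-local free-energy kernels in one call.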
+ nbv->dispatchFreeEnergyKernels(
+ x,
&forceOutNonbonded->forceWithShiftForces(),
fr->use_simd_kernels,
fr->ntype,
enerd,
stepWork,
nrnb);
-
- if (simulationWork.havePpDomainDecomposition)
- {
- nbv->dispatchFreeEnergyKernel(
- InteractionLocality::NonLocal,
- x.unpaddedArrayRef(),
- &forceOutNonbonded->forceWithShiftForces(),
- fr->use_simd_kernels,
- fr->ntype,
- fr->rlist,
- *fr->ic,
- fr->shift_vec,
- fr->nbfp,
- fr->ljpme_c6grid,
- mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
- : gmx::ArrayRef<real>{},
- mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
- : gmx::ArrayRef<real>{},
- mdatoms->typeA ? gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr)
- : gmx::ArrayRef<int>{},
- mdatoms->typeB ? gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr)
- : gmx::ArrayRef<int>{},
- inputrec.fepvals.get(),
- lambda,
- enerd,
- stepWork,
- nrnb);
- }
}
if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
/* Since all atoms are in the rectangular or triclinic unit-cell,
* only single box vector shifts (2 in x) are required.
*/
- set_pbc_dd(&pbc, fr->pbcType, DOMAINDECOMP(cr) ? cr->dd->numCells : nullptr, TRUE, box);
+ set_pbc_dd(&pbc, fr->pbcType, haveDDAtomOrdering(*cr) ? cr->dd->numCells : nullptr, TRUE, box);
}
for (int mtsIndex = 0; mtsIndex < (simulationWork.useMts && stepWork.computeSlowForces ? 2 : 1);
nrnb,
lambda,
mdatoms,
- DOMAINDECOMP(cr) ? cr->dd->globalAtomIndices.data() : nullptr,
+ haveDDAtomOrdering(*cr) ? cr->dd->globalAtomIndices.data() : nullptr,
stepWork);
}
}
if (stepWork.computeSlowForces)
{
- calculateLongRangeNonbondeds(fr,
- inputrec,
- cr,
- nrnb,
- wcycle,
- mdatoms,
- x.unpaddedConstArrayRef(),
- &forceOutMtsLevel1->forceWithVirial(),
- enerd,
- box,
- lambda,
- dipoleData.muStateAB,
- stepWork,
- ddBalanceRegionHandler);
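+        // Long-range nonbonded work now goes through the CpuPpLongRangeNonbondeds object passed in above.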
+ longRangeNonbondeds->calculate(fr->pmedata,
+ cr,
+ x.unpaddedConstArrayRef(),
+ &forceOutMtsLevel1->forceWithVirial(),
+ enerd,
+ box,
+ lambda,
+ dipoleData.muStateAB,
+ stepWork,
+ ddBalanceRegionHandler);
}
wallcycle_stop(wcycle, WallCycleCounter::Force);
if (!stepWork.useGpuFHalo)
{
+        /* We don't explicitly wait for the forces to be reduced on the device,
+         * but instead wait for them to finish copying to the CPU.
+         * So we manually consume the event here; see Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::NonLocal);
// copy from GPU input for dd_move_f()
stateGpu->copyForcesFromGpu(forceOutMtsLevel0.forceWithShiftForces().force(),
AtomLocality::NonLocal);
{
// If there exist CPU forces, data from halo exchange should accumulate into these
bool accumulateForces = domainWork.haveCpuLocalForceWork;
- if (!accumulateForces)
- {
- // Force halo exchange will set a subset of local atoms with remote non-local data
- // First clear local portion of force array, so that untouched atoms are zero
- stateGpu->clearForcesOnGpu(AtomLocality::Local);
- }
- communicateGpuHaloForces(*cr, accumulateForces);
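+            // The GPU force halo exchange must wait for the local forces to be ready on
+            // the device and for the non-local force reduction to complete.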
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
+ gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
+ gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+
+ communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
}
else
{
// With both nonbonded and PME offloaded a GPU on the same rank, we use
// an alternating wait/reduction scheme.
bool alternateGpuWait =
- (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank
- && simulationWork.useGpuNonbonded && !DOMAINDECOMP(cr) && !stepWork.useGpuFBufferOps);
+ (!c_disableAlternatingWait && stepWork.haveGpuPmeOnThisRank && simulationWork.useGpuNonbonded
+ && !simulationWork.havePpDomainDecomposition && !stepWork.useGpuFBufferOps);
+
if (alternateGpuWait)
{
alternatePmeNbGpuWaitReduce(fr->nbv.get(),
enerd,
stepWork,
InteractionLocality::Local,
- DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
+ haveDDAtomOrdering(*cr) ? enbvClearFNo : enbvClearFYes,
step,
nrnb,
wcycle);
// These should be unified.
if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
{
- // Note: AtomLocality::All is used for the non-DD case because, as in this
- // case copyForcesToGpu() uses a separate stream, it allows overlap of
- // CPU force H2D with GPU force tasks on all streams including those in the
- // local stream which would otherwise be implicit dependencies for the
- // transfer and would not overlap.
- auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
- : AtomLocality::All;
-
- stateGpu->copyForcesToGpu(forceWithShift, locality);
+ stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
}
if (stepWork.computeNonbondedForces)
// NOTE: If there are virtual sites, the forces are modified on host after this D2H copy. Hence,
// they should not be copied in do_md(...) for the output.
if (!simulationWork.useGpuUpdate
- || (simulationWork.useGpuUpdate && DOMAINDECOMP(cr) && simulationWork.useCpuPmePpCommunication)
+ || (simulationWork.useGpuUpdate && haveDDAtomOrdering(*cr) && simulationWork.useCpuPmePpCommunication)
|| vsite)
{
+ if (stepWork.computeNonbondedForces)
+ {
+                /* We have previously issued a force reduction on the GPU, but will not
+                 * use its completion event, relying instead on the stream being in-order.
+                 * Issue #3988. */
+ stateGpu->consumeForcesReducedOnDeviceEvent(AtomLocality::Local);
+ }
stateGpu->copyForcesFromGpu(forceWithShift, AtomLocality::Local);
stateGpu->waitForcesReadyOnHost(AtomLocality::Local);
}
launchGpuEndOfStepTasks(
nbv, fr->listedForcesGpu.get(), fr->pmedata, enerd, *runScheduleWork, step, wcycle);
- if (DOMAINDECOMP(cr))
+ if (haveDDAtomOrdering(*cr))
{
dd_force_flop_stop(cr->dd, nrnb);
}