#include "gromacs/domdec/partition.h"
#include "gromacs/essentialdynamics/edsam.h"
#include "gromacs/ewald/pme.h"
+#include "gromacs/ewald/pme_coordinate_receiver_gpu.h"
#include "gromacs/ewald/pme_pp.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/gmxlib/network.h"
gmx_wallcycle* wcycle)
{
pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
- pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
+ bool useGpuDirectComm = false;
+ gmx::PmeCoordinateReceiverGpu* pmeCoordinateReceiverGpu = nullptr;
+ pme_gpu_launch_spread(
+ pmedata, xReadyOnDevice, wcycle, lambdaQ, useGpuDirectComm, pmeCoordinateReceiverGpu);
}
/*! \brief Launch the FFT and gather stages of PME GPU
const bool rankHasGpuPmeTask = simulationWork.useGpuPme && !simulationWork.haveSeparatePmeRank;
flags.useGpuPmeFReduction = flags.computeSlowForces && flags.useGpuFBufferOps
&& (rankHasGpuPmeTask || simulationWork.useGpuPmePpCommunication);
- flags.useGpuXHalo = simulationWork.useGpuHaloExchange;
+ flags.useGpuXHalo = simulationWork.useGpuHaloExchange && !flags.doNeighborSearch;
flags.useGpuFHalo = simulationWork.useGpuHaloExchange && flags.useGpuFBufferOps;
flags.haveGpuPmeOnThisRank = rankHasGpuPmeTask && flags.computeSlowForces;
flags.combineMtsForcesBeforeHaloExchange =
const bool accumulate = runScheduleWork->domainWork.haveCpuLocalForceWork
|| runScheduleWork->simulationWork.havePpDomainDecomposition;
const int atomStart = 0;
- fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(stateGpu->getForces(),
- nbv->getNumAtoms(AtomLocality::Local),
- nbv->getGridIndices(),
- atomStart,
- accumulate,
- stateGpu->fReducedOnDevice());
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
+ stateGpu->getForces(),
+ nbv->getNumAtoms(AtomLocality::Local),
+ nbv->getGridIndices(),
+ atomStart,
+ accumulate,
+ stateGpu->fReducedOnDevice(AtomLocality::Local));
// register forces and add dependencies
fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(Nbnxm::gpu_get_f(nbv->gpu_nbv));
: pme_gpu_get_device_f(fr->pmedata); // PME force buffer on same GPU
fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
- GpuEventSynchronizer* const pmeSynchronizer =
- (runScheduleWork->simulationWork.haveSeparatePmeRank
- ? fr->pmePpCommGpu->getForcesReadySynchronizer() // buffer received from other GPU
- : pme_gpu_get_f_ready_synchronizer(fr->pmedata)); // PME force buffer on same GPU
- if (GMX_THREAD_MPI)
+ if (runScheduleWork->simulationWork.haveSeparatePmeRank)
{
+ // PME force buffer on remote GPU -
+ // event synchronizer received from other GPU only in case of thread-mpi
+ if (GMX_THREAD_MPI)
+ {
+ GpuEventSynchronizer* const pmeSynchronizer =
+ fr->pmePpCommGpu->getForcesReadySynchronizer();
+ GMX_ASSERT(pmeSynchronizer != nullptr,
+ "PME force ready cuda event should not be NULL");
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
+ }
+ }
+ else
+ {
+ // PME force buffer on same GPU - add dependency on PME force computation
+ GpuEventSynchronizer* const pmeSynchronizer = pme_gpu_get_f_ready_synchronizer(fr->pmedata);
GMX_ASSERT(pmeSynchronizer != nullptr, "PME force ready cuda event should not be NULL");
fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
}
}
- if (runScheduleWork->domainWork.haveCpuLocalForceWork && !runScheduleWork->simulationWork.useGpuHaloExchange)
+ if (runScheduleWork->domainWork.haveCpuLocalForceWork
+ || (runScheduleWork->simulationWork.havePpDomainDecomposition
+ && !runScheduleWork->simulationWork.useGpuHaloExchange))
{
- // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
- if (!runScheduleWork->simulationWork.havePpDomainDecomposition)
- {
- const bool useGpuForceBufferOps = true;
- fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
- stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::All, useGpuForceBufferOps));
- }
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
+ stateGpu->fReadyOnDevice(AtomLocality::Local));
}
if (runScheduleWork->simulationWork.useGpuHaloExchange)
const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
|| runScheduleWork->domainWork.haveFreeEnergyWork;
const int atomStart = dd_numHomeAtoms(*cr->dd);
- fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(stateGpu->getForces(),
- nbv->getNumAtoms(AtomLocality::NonLocal),
- nbv->getGridIndices(),
- atomStart,
- accumulate);
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
+ stateGpu->getForces(),
+ nbv->getNumAtoms(AtomLocality::NonLocal),
+ nbv->getGridIndices(),
+ atomStart,
+ accumulate,
+ stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
// register forces and add dependencies
- // in the DD case we use the same stream for H2D and reduction, hence no explicit dependency needed
fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(
Nbnxm::gpu_get_f(nbv->gpu_nbv));
+
+ if (runScheduleWork->domainWork.haveNonLocalForceContribInCpuBuffer)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
+ stateGpu->fReadyOnDevice(AtomLocality::NonLocal));
+ }
}
}
runScheduleWork->stepWork = setupStepWorkload(legacyFlags, inputrec.mtsLevels, step, simulationWork);
const StepWorkload& stepWork = runScheduleWork->stepWork;
+ if (stepWork.useGpuFHalo && !runScheduleWork->domainWork.haveCpuLocalForceWork)
+ {
+ // GPU Force halo exchange will set a subset of local atoms with remote non-local data
+ // First clear local portion of force array, so that untouched atoms are zero.
+ // The dependency for this is that forces from previous timestep have been consumed,
+ // which is satisfied when getCoordinatesReadyOnDeviceEvent has been marked.
+ stateGpu->clearForcesOnGpu(AtomLocality::Local,
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
+ AtomLocality::Local, simulationWork, stepWork));
+ }
+
/* At a search step we need to start the first balancing region
* somewhere early inside the step after communication during domain
* decomposition (and not during the previous step as usual).
}
else
{
+ GpuEventSynchronizer* gpuCoordinateHaloLaunched = nullptr;
if (stepWork.useGpuXHalo)
{
// The following must be called after local setCoordinates (which records an event
// when the coordinate data has been copied to the device).
- communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
+ gpuCoordinateHaloLaunched = communicateGpuHaloCoordinates(*cr, box, localXReadyOnDevice);
if (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork)
{
// non-local part of coordinate buffer must be copied back to host for CPU work
- stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
+ stateGpu->copyCoordinatesFromGpu(
+ x.unpaddedArrayRef(), AtomLocality::NonLocal, gpuCoordinateHaloLaunched);
}
}
else
{
stateGpu->copyCoordinatesToGpu(x.unpaddedArrayRef(), AtomLocality::NonLocal);
}
- nbv->convertCoordinatesGpu(AtomLocality::NonLocal,
- stateGpu->getCoordinates(),
- stateGpu->getCoordinatesReadyOnDeviceEvent(
- AtomLocality::NonLocal, simulationWork, stepWork));
+ nbv->convertCoordinatesGpu(
+ AtomLocality::NonLocal,
+ stateGpu->getCoordinates(),
+ stateGpu->getCoordinatesReadyOnDeviceEvent(
+ AtomLocality::NonLocal, simulationWork, stepWork, gpuCoordinateHaloLaunched));
}
else
{
}
}
+ // With FEP we set up the reduction over threads for local+non-local simultaneously,
+ // so we need to do that here after the local and non-local pairlist construction.
+ if (stepWork.doNeighborSearch && fr->efep != FreeEnergyPerturbationType::No)
+ {
+ wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedFep);
+ nbv->setupFepThreadedForceBuffer(fr->natoms_force_constr);
+ wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedFep);
+ }
+
if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
{
/* launch D2H copy-back F */
/* Calculate the local and non-local free energy interactions here.
* Happens here on the CPU both with and without GPU.
*/
- nbv->dispatchFreeEnergyKernel(
- InteractionLocality::Local,
- x.unpaddedArrayRef(),
+ nbv->dispatchFreeEnergyKernels(
+ x,
&forceOutNonbonded->forceWithShiftForces(),
fr->use_simd_kernels,
fr->ntype,
enerd,
stepWork,
nrnb);
-
- if (simulationWork.havePpDomainDecomposition)
- {
- nbv->dispatchFreeEnergyKernel(
- InteractionLocality::NonLocal,
- x.unpaddedArrayRef(),
- &forceOutNonbonded->forceWithShiftForces(),
- fr->use_simd_kernels,
- fr->ntype,
- fr->rlist,
- *fr->ic,
- fr->shift_vec,
- fr->nbfp,
- fr->ljpme_c6grid,
- mdatoms->chargeA ? gmx::arrayRefFromArray(mdatoms->chargeA, mdatoms->nr)
- : gmx::ArrayRef<real>{},
- mdatoms->chargeB ? gmx::arrayRefFromArray(mdatoms->chargeB, mdatoms->nr)
- : gmx::ArrayRef<real>{},
- mdatoms->typeA ? gmx::arrayRefFromArray(mdatoms->typeA, mdatoms->nr)
- : gmx::ArrayRef<int>{},
- mdatoms->typeB ? gmx::arrayRefFromArray(mdatoms->typeB, mdatoms->nr)
- : gmx::ArrayRef<int>{},
- inputrec.fepvals.get(),
- lambda,
- enerd,
- stepWork,
- nrnb);
- }
}
if (stepWork.computeNonbondedForces && !useOrEmulateGpuNb)
{
// If there exist CPU forces, data from halo exchange should accumulate into these
bool accumulateForces = domainWork.haveCpuLocalForceWork;
- if (!accumulateForces)
- {
- // Force halo exchange will set a subset of local atoms with remote non-local data
- // First clear local portion of force array, so that untouched atoms are zero
- stateGpu->clearForcesOnGpu(AtomLocality::Local);
- }
- communicateGpuHaloForces(*cr, accumulateForces);
+ gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> gpuForceHaloDependencies;
+ gpuForceHaloDependencies.push_back(stateGpu->fReadyOnDevice(AtomLocality::Local));
+ gpuForceHaloDependencies.push_back(stateGpu->fReducedOnDevice(AtomLocality::NonLocal));
+
+ communicateGpuHaloForces(*cr, accumulateForces, &gpuForceHaloDependencies);
}
else
{
// These should be unified.
if (domainWork.haveLocalForceContribInCpuBuffer && !stepWork.useGpuFHalo)
{
- // Note: AtomLocality::All is used for the non-DD case because, as in this
- // case copyForcesToGpu() uses a separate stream, it allows overlap of
- // CPU force H2D with GPU force tasks on all streams including those in the
- // local stream which would otherwise be implicit dependencies for the
- // transfer and would not overlap.
- auto locality = simulationWork.havePpDomainDecomposition ? AtomLocality::Local
- : AtomLocality::All;
-
- stateGpu->copyForcesToGpu(forceWithShift, locality);
+ stateGpu->copyForcesToGpu(forceWithShift, AtomLocality::Local);
}
if (stepWork.computeNonbondedForces)