#include "gromacs/utility/strconvert.h"
#include "gromacs/utility/sysinfo.h"
+#include "gpuforcereduction.h"
+
using gmx::ArrayRef;
using gmx::AtomLocality;
using gmx::DomainLifetimeWorkload;
}
}
+
+/*! \brief Setup for the local and non-local GPU force reductions:
+ * reinitialization plus the registration of forces and dependencies.
+ *
+ * Called from do_force() on neighbour-search steps when GPU buffer ops
+ * are in use, since the atom counts and nbnxm grid indices consumed by
+ * the reductions change on those steps.
+ *
+ * \param [in] runScheduleWork Schedule workload flag structure
+ * \param [in] cr Communication record object
+ * \param [in] fr Force record object
+ * \param [in] ddUsesGpuDirectCommunication Whether GPU direct communication is in use
+ */
+static void setupGpuForceReductions(gmx::MdrunScheduleWorkload* runScheduleWork,
+ const t_commrec* cr,
+ t_forcerec* fr,
+ bool ddUsesGpuDirectCommunication)
+{
+
+ nonbonded_verlet_t* nbv = fr->nbv.get();
+ gmx::StatePropagatorDataGpu* stateGpu = fr->stateGpu;
+
+ // (re-)initialize local GPU force reduction
+ // NOTE(review): accumulate presumably makes the reduction add into,
+ // rather than overwrite, the destination buffer when CPU-side local
+ // force work or halo-exchange contributions are also present —
+ // confirm against GpuForceReduction::reinit().
+ const bool accumulate =
+ runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr);
+ const int atomStart = 0;
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->reinit(
+ stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::Local), nbv->getGridIndices(),
+ atomStart, accumulate, stateGpu->fReducedOnDevice());
+
+ // register forces and add dependencies
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->registerNbnxmForce(nbv->getGpuForces());
+
+ // PME forces enter the local reduction either from this rank's PME GPU
+ // buffer, or from the staging buffer that receives forces from a
+ // separate PME rank over GPU PME-PP communication.
+ if (runScheduleWork->simulationWork.useGpuPme
+ && (thisRankHasDuty(cr, DUTY_PME) || runScheduleWork->simulationWork.useGpuPmePpCommunication))
+ {
+ void* forcePtr = thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
+ : // PME force buffer on same GPU
+ fr->pmePpCommGpu->getGpuForceStagingPtr(); // buffer received from other GPU
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->registerRvecForce(forcePtr);
+
+ // The reduction must not start before the chosen PME force buffer is ready.
+ GpuEventSynchronizer* const pmeSynchronizer =
+ (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
+ : // PME force buffer on same GPU
+ fr->pmePpCommGpu->getForcesReadySynchronizer()); // buffer received from other GPU
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(pmeSynchronizer);
+ }
+
+ // Without GPU direct communication, wait for the event signalling that
+ // local forces are ready on the device (presumably recorded after the
+ // H2D copy of CPU-produced forces — confirm against
+ // StatePropagatorDataGpu::getForcesReadyOnDeviceEvent()).
+ if ((runScheduleWork->domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr))
+ && !ddUsesGpuDirectCommunication)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
+ stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::Local, true));
+ }
+
+ // With GPU direct communication, halo-exchanged forces arrive on the
+ // device, so depend on the halo exchange's completion event instead.
+ if (ddUsesGpuDirectCommunication)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->addDependency(
+ cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
+ }
+
+ if (havePPDomainDecomposition(cr))
+ {
+ // (re-)initialize non-local GPU force reduction
+ const bool accumulate = runScheduleWork->domainWork.haveCpuBondedWork
+ || runScheduleWork->domainWork.haveFreeEnergyWork;
+ // Non-local atoms follow the home atoms in the force buffer.
+ const int atomStart = dd_numHomeAtoms(*cr->dd);
+ // NOTE(review): unlike the local reinit, no completion event is
+ // passed here — non-local reduction completion is presumably
+ // tracked by another mechanism; confirm.
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->reinit(
+ stateGpu->getForces(), nbv->getNumAtoms(AtomLocality::NonLocal),
+ nbv->getGridIndices(), atomStart, accumulate);
+
+ // register forces and add dependencies
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->registerNbnxmForce(nbv->getGpuForces());
+ if (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->addDependency(
+ stateGpu->getForcesReadyOnDeviceEvent(AtomLocality::NonLocal, true));
+ }
+ }
+}
+
+
void do_force(FILE* fplog,
const t_commrec* cr,
const gmx_multisim_t* ms,
launchPmeGpuSpread(fr->pmedata, box, stepWork, localXReadyOnDevice, lambda[efptCOUL], wcycle);
}
+ const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
+
/* do gridding for pair search */
if (stepWork.doNeighborSearch)
{
{
nbv->atomdata_init_copy_x_to_nbat_x_gpu();
}
- // For force buffer ops, we use the below conditon rather than
- // useGpuFBufferOps to ensure that init is performed even if this
- // NS step is also a virial step (on which f buf ops are deactivated).
- if (GMX_GPU_CUDA && simulationWork.useGpuBufferOps && simulationWork.useGpuNonbonded)
+
+ if (simulationWork.useGpuBufferOps)
{
- GMX_ASSERT(stateGpu, "stateGpu should be valid when buffer ops are offloaded");
- nbv->atomdata_init_add_nbat_f_to_f_gpu(stateGpu->fReducedOnDevice());
+ setupGpuForceReductions(runScheduleWork, cr, fr, ddUsesGpuDirectCommunication);
}
}
else if (!EI_TPI(inputrec->eI))
}
}
- const gmx::DomainLifetimeWorkload& domainWork = runScheduleWork->domainWork;
-
if (simulationWork.useGpuNonbonded)
{
ddBalanceRegionHandler.openBeforeForceComputationGpu();
if (stepWork.useGpuFBufferOps)
{
- gmx::FixedCapacityVector<GpuEventSynchronizer*, 1> dependencyList;
-
// TODO: move this into DomainLifetimeWorkload, including the second part of the
// condition The bonded and free energy CPU tasks can have non-local force
// contributions which are a dependency for the GPU force reduction.
{
stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(),
AtomLocality::NonLocal);
- dependencyList.push_back(stateGpu->getForcesReadyOnDeviceEvent(
- AtomLocality::NonLocal, stepWork.useGpuFBufferOps));
}
- nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::NonLocal, stateGpu->getForces(),
- pme_gpu_get_device_f(fr->pmedata), dependencyList,
- false, haveNonLocalForceContribInCpuBuffer);
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal]->execute();
+
if (!useGpuForcesHaloExchange)
{
// copy from GPU input for dd_move_f()
* on the non-alternating path. */
if (useOrEmulateGpuNb && !alternateGpuWait)
{
- // TODO simplify the below conditionals. Pass buffer and sync pointers at init stage rather than here. Unify getter fns for sameGPU/otherGPU cases.
- void* pmeForcePtr =
- stepWork.useGpuPmeFReduction
- ? (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_device_f(fr->pmedata)
- : // PME force buffer on same GPU
- fr->pmePpCommGpu->getGpuForceStagingPtr()) // buffer received from other GPU
- : nullptr; // PME reduction not active on GPU
-
- GpuEventSynchronizer* const pmeSynchronizer =
- stepWork.useGpuPmeFReduction
- ? (thisRankHasDuty(cr, DUTY_PME) ? pme_gpu_get_f_ready_synchronizer(fr->pmedata)
- : // PME force buffer on same GPU
- static_cast<GpuEventSynchronizer*>(
- fr->pmePpCommGpu->getForcesReadySynchronizer())) // buffer received from other GPU
- : nullptr; // PME reduction not active on GPU
-
- gmx::FixedCapacityVector<GpuEventSynchronizer*, 3> dependencyList;
-
- if (stepWork.useGpuPmeFReduction)
- {
- dependencyList.push_back(pmeSynchronizer);
- }
-
gmx::ArrayRef<gmx::RVec> forceWithShift = forceOut.forceWithShiftForces().force();
if (stepWork.useGpuFBufferOps)
auto locality = havePPDomainDecomposition(cr) ? AtomLocality::Local : AtomLocality::All;
stateGpu->copyForcesToGpu(forceWithShift, locality);
- dependencyList.push_back(
- stateGpu->getForcesReadyOnDeviceEvent(locality, stepWork.useGpuFBufferOps));
- }
- if (useGpuForcesHaloExchange)
- {
- dependencyList.push_back(cr->dd->gpuHaloExchange[0][0]->getForcesReadyOnDeviceEvent());
}
- nbv->atomdata_add_nbat_f_to_f_gpu(AtomLocality::Local, stateGpu->getForces(), pmeForcePtr,
- dependencyList, stepWork.useGpuPmeFReduction,
- haveLocalForceContribInCpuBuffer);
+
+ fr->gpuForceReduction[gmx::AtomLocality::Local]->execute();
+
// Copy forces to host if they are needed for update or if virtual sites are enabled.
// If there are vsites, we need to copy forces every step to spread vsite forces on host.
// TODO: When the output flags will be included in step workload, this copy can be combined with the