#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
-#include "gromacs/mdlib/ppforceworkload.h"
#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/update.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/iforceprovider.h"
#include "gromacs/mdtypes/inputrec.h"
#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/nbnxm/atomdata.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/utility/sysinfo.h"
using gmx::ForceOutputs;
+using gmx::StepWorkload;
+using gmx::DomainLifetimeWorkload;
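// A minimal sketch of the new workload types, with members inferred solely from
// their uses in this file. The authoritative definitions live in
// gromacs/mdtypes/simulation_workload.h and may contain further members; the
// sketch is wrapped in its own namespace so it cannot clash with the real types.
namespace workload_sketch
{
//! Flags that can change from step to step.
struct StepWorkload
{
    bool stateChanged           = false; //!< Coordinates or box changed this step
    bool haveDynamicBox         = false; //!< Box can change (pressure coupling, deformation)
    bool doNeighborSearch       = false; //!< Pair search (and domain setup) runs this step
    bool computeVirial          = false;
    bool computeEnergy          = false;
    bool computeForces          = false;
    bool computeNonbondedForces = false;
};

//! Flags that stay valid from one neighbour search to the next.
struct DomainLifetimeWorkload
{
    bool haveSpecialForces      = false;
    bool haveCpuBondedWork      = false;
    bool haveGpuBondedWork      = false;
    bool haveRestraintsWork     = false;
    bool haveCpuListedForceWork = false;
};

//! Aggregate that travels through the MD loop as runScheduleWork.
struct MdrunScheduleWorkload
{
    StepWorkload           stepWork;
    DomainLifetimeWorkload domainWork;
};
}   // namespace workload_sketch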
// TODO: this environment variable allows us to verify before release
// that on less common architectures the total cost of polling is not larger than
const t_graph *graph,
const t_forcerec *fr,
const gmx_vsite_t *vsite,
- const gmx::ForceFlags &forceFlags)
+ const StepWorkload &stepWork)
{
rvec *f = as_rvec_array(forceOutputs->forceWithShiftForces().force().data());
/* Spread the mem-mem virial from the springs */
matrix virial = { { 0 } };
spread_vsite_f(vsite, x, fDirectVir, nullptr,
- forceFlags.computeVirial, virial,
+ stepWork.computeVirial, virial,
nrnb,
&top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
forceWithVirial.addVirialContribution(virial);
}
- if (forceFlags.computeVirial)
+ if (stepWork.computeVirial)
{
/* Now add the forces, this is local */
sum_forces(f, forceWithVirial.force_);
static void do_nb_verlet(t_forcerec *fr,
const interaction_const_t *ic,
gmx_enerdata_t *enerd,
- const gmx::ForceFlags &forceFlags,
+ const StepWorkload &stepWork,
const Nbnxm::InteractionLocality ilocality,
const int clearF,
const int64_t step,
t_nrnb *nrnb,
gmx_wallcycle_t wcycle)
{
- if (!forceFlags.computeNonbondedForces)
+ if (!stepWork.computeNonbondedForces)
{
/* skip non-bonded calculation */
return;
}
}
- nbv->dispatchNonbondedKernel(ilocality, *ic, forceFlags, clearF, *fr, enerd, nrnb);
+ nbv->dispatchNonbondedKernel(ilocality, *ic, stepWork, clearF, *fr, enerd, nrnb);
}
static inline void clear_rvecs_omp(int n, rvec v[])
* \param[in] x The coordinates
* \param[in] mdatoms Per atom properties
* \param[in] lambda Array of free-energy lambda values
- * \param[in] forceFlags Force schedule flags
+ * \param[in] stepWork Step schedule flags
* \param[in,out] forceWithVirial Force and virial buffers
* \param[in,out] enerd Energy buffer
* \param[in,out] ed Essential dynamics pointer
gmx::ArrayRef<const gmx::RVec> x,
const t_mdatoms *mdatoms,
real *lambda,
- const gmx::ForceFlags &forceFlags,
+ const StepWorkload &stepWork,
gmx::ForceWithVirial *forceWithVirial,
gmx_enerdata_t *enerd,
gmx_edsam *ed,
/* NOTE: Currently all ForceProviders only provide forces.
* When they also provide energies, remove this conditional.
*/
- if (forceFlags.computeForces)
+ if (stepWork.computeForces)
{
gmx::ForceProviderInput forceProviderInput(x, *mdatoms, t, box, *cr);
gmx::ForceProviderOutput forceProviderOutput(forceWithVirial, enerd);
}
/* Add forces from interactive molecular dynamics (IMD), if any */
- if (inputrec->bIMD && forceFlags.computeForces)
+ if (inputrec->bIMD && stepWork.computeForces)
{
imdSession->applyForces(f);
}
* \param[in] pmedata The PME structure
* \param[in] box The box matrix
* \param[in] x Coordinate array
- * \param[in] forceFlags Force schedule flags
+ * \param[in] stepWork Step schedule flags
* \param[in] pmeFlags PME flags
* \param[in] useGpuForceReduction True if GPU-based force reduction is active this step
* \param[in] wcycle The wallcycle structure
*/
-static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
- const matrix box,
- const rvec x[],
- const gmx::ForceFlags &forceFlags,
- int pmeFlags,
- bool useGpuForceReduction,
- gmx_wallcycle_t wcycle)
+static inline void launchPmeGpuSpread(gmx_pme_t *pmedata,
+ const matrix box,
+ const rvec x[],
+ const StepWorkload &stepWork,
+ int pmeFlags,
+ bool useGpuForceReduction,
+ gmx_wallcycle_t wcycle)
{
- pme_gpu_prepare_computation(pmedata, forceFlags.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
+ pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
pme_gpu_copy_coordinates_to_gpu(pmedata, x, wcycle);
pme_gpu_launch_spread(pmedata, wcycle);
}
* \param[in,out] pmedata PME module data
* \param[in,out] forceOutputs Output buffer for the forces and virial
* \param[in,out] enerd Energy data structure results are reduced into
- * \param[in] forceFlags Force schedule flags
+ * \param[in] stepWork Step schedule flags
* \param[in] pmeFlags PME flags
* \param[in] wcycle The wallcycle structure
*/
-static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
- gmx_pme_t *pmedata,
- gmx::ForceOutputs *forceOutputs,
- gmx_enerdata_t *enerd,
- const gmx::ForceFlags &forceFlags,
- int pmeFlags,
- gmx_wallcycle_t wcycle)
+static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t *nbv,
+ gmx_pme_t *pmedata,
+ gmx::ForceOutputs *forceOutputs,
+ gmx_enerdata_t *enerd,
+ const StepWorkload &stepWork,
+ int pmeFlags,
+ gmx_wallcycle_t wcycle)
{
bool isPmeGpuDone = false;
bool isNbGpuDone = false;
{
GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
- forceFlags,
+ stepWork,
Nbnxm::AtomLocality::Local,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
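// The surrounding loop alternates between polling the PME and the nonbonded
// GPU task so that whichever finishes first is reduced first, and only blocks
// (GpuTaskCompletion::Wait) on the last outstanding task. Below is a
// self-contained sketch of that pattern using std::future as a stand-in for
// the GPU task queries; the enum and tryFinishTask helper are illustrative
// stand-ins, not the GROMACS API.
#include <chrono>
#include <future>

namespace alternating_wait_sketch
{
enum class GpuTaskCompletion { Check, Wait };

//! Returns true once the task has completed; Wait blocks, Check only polls.
bool tryFinishTask(std::future<void> &task, GpuTaskCompletion mode)
{
    if (mode == GpuTaskCompletion::Wait)
    {
        task.wait();
        return true;
    }
    return task.wait_for(std::chrono::seconds(0)) == std::future_status::ready;
}

void alternatingWaitReduce()
{
    auto pmeTask = std::async(std::launch::async, [] { /* pretend PME work */ });
    auto nbTask  = std::async(std::launch::async, [] { /* pretend NB work */ });

    bool isPmeGpuDone = false;
    bool isNbGpuDone  = false;
    while (!isPmeGpuDone || !isNbGpuDone)
    {
        if (!isPmeGpuDone)
        {
            // Block on PME only when the nonbonded task has already been handled.
            auto completionType = isNbGpuDone ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isPmeGpuDone        = tryFinishTask(pmeTask, completionType);
            // ...on completion, reduce PME forces/energies here...
        }
        if (!isNbGpuDone)
        {
            auto completionType = isPmeGpuDone ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
            isNbGpuDone         = tryFinishTask(nbTask, completionType);
            // ...on completion, reduce nonbonded forces/energies here...
        }
    }
}
}   // namespace alternating_wait_sketch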
* \param[in] pull_work The pull work object.
* \param[in] inputrec input record
* \param[in] force force array
- * \param[in] forceFlags Force schedule flags
+ * \param[in] stepWork Step schedule flags
* \param[out] wcycle wallcycle recording structure
*
* \returns Cleared force output structure
pull_t *pull_work,
const t_inputrec &inputrec,
gmx::ArrayRefWithPadding<gmx::RVec> force,
- const gmx::ForceFlags &forceFlags,
+ const StepWorkload &stepWork,
gmx_wallcycle_t wcycle)
{
wallcycle_sub_start(wcycle, ewcsCLEAR_FORCE_BUFFER);
/* NOTE: We assume fr->shiftForces is all zeros here */
- gmx::ForceWithShiftForces forceWithShiftForces(force, forceFlags.computeVirial, fr->shiftForces);
+ gmx::ForceWithShiftForces forceWithShiftForces(force, stepWork.computeVirial, fr->shiftForces);
- if (forceFlags.computeForces)
+ if (stepWork.computeForces)
{
/* Clear the short- and long-range forces */
clear_rvecs_omp(fr->natoms_force_constr,
* directly, such as PME. Otherwise, forceWithVirial uses the
 * same force (f in legacy calls) buffer as other algorithms.
*/
- const bool useSeparateForceWithVirialBuffer = (forceFlags.computeForces &&
- (forceFlags.computeVirial && fr->haveDirectVirialContributions));
+ const bool useSeparateForceWithVirialBuffer = (stepWork.computeForces &&
+ (stepWork.computeVirial && fr->haveDirectVirialContributions));
/* forceWithVirial uses the local atom range only */
gmx::ForceWithVirial forceWithVirial(useSeparateForceWithVirialBuffer ?
fr->forceBufferForDirectVirialContributions : force.unpaddedArrayRef(),
- forceFlags.computeVirial);
+ stepWork.computeVirial);
if (useSeparateForceWithVirialBuffer)
{
}
-/*! \brief Set up flags that indicate what type of work is there to compute.
- *
- * Currently we only update it at search steps,
- * but some properties may change more frequently (e.g. virial/non-virial step),
- * so when including those either the frequency of update (per-step) or the scope
- * of a flag will change (i.e. a set of flags for nstlist steps).
- *
+/*! \brief Set up flags that have the lifetime of the domain, indicating what type of work there is to compute.
*/
static void
-setupForceWorkload(gmx::PpForceWorkload *forceWork,
- const t_inputrec *inputrec,
- const t_forcerec *fr,
- const pull_t *pull_work,
- const gmx_edsam *ed,
- const t_idef &idef,
- const t_fcdata *fcd,
- const gmx::ForceFlags &forceFlags
- )
+setupDomainLifetimeWorkload(DomainLifetimeWorkload *domainWork,
+ const t_inputrec *inputrec,
+ const t_forcerec *fr,
+ const pull_t *pull_work,
+ const gmx_edsam *ed,
+ const t_idef &idef,
+ const t_fcdata *fcd,
+ const StepWorkload &stepWork)
{
- forceWork->haveSpecialForces = haveSpecialForces(inputrec, fr->forceProviders, pull_work, forceFlags.computeForces, ed);
- forceWork->haveCpuBondedWork = haveCpuBondeds(*fr);
- forceWork->haveGpuBondedWork = ((fr->gpuBonded != nullptr) && fr->gpuBonded->haveInteractions());
- forceWork->haveRestraintsWork = havePositionRestraints(idef, *fcd);
- forceWork->haveCpuListedForceWork = haveCpuListedForces(*fr, idef, *fcd);
+ domainWork->haveSpecialForces = haveSpecialForces(inputrec, fr->forceProviders, pull_work, stepWork.computeForces, ed);
+ domainWork->haveCpuBondedWork = haveCpuBondeds(*fr);
+ domainWork->haveGpuBondedWork = ((fr->gpuBonded != nullptr) && fr->gpuBonded->haveInteractions());
+ domainWork->haveRestraintsWork = havePositionRestraints(idef, *fcd);
+ domainWork->haveCpuListedForceWork = haveCpuListedForces(*fr, idef, *fcd);
}
/*! \brief Set up the step workload struct from the legacy force bitmask.
 * \param[in] isNonbondedOn Global override; if false, all nonbonded computation is turned off.
*/
static void
-setupForceFlags(gmx::ForceFlags *flags,
- const int legacyFlags,
- const bool isNonbondedOn)
+setupStepWorkload(StepWorkload *flags,
+ const int legacyFlags,
+ const bool isNonbondedOn)
{
flags->stateChanged = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
flags->haveDynamicBox = ((legacyFlags & GMX_FORCE_DYNAMICBOX) != 0);
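// The remaining members follow the same decoding pattern; a sketch of the rest
// of this function, assuming GMX_FORCE_* masks analogous to the two used above
// (the elided original may differ in member order and coverage):
flags->doNeighborSearch       = ((legacyFlags & GMX_FORCE_NS) != 0);
flags->computeVirial          = ((legacyFlags & GMX_FORCE_VIRIAL) != 0);
flags->computeEnergy          = ((legacyFlags & GMX_FORCE_ENERGY) != 0);
flags->computeForces          = ((legacyFlags & GMX_FORCE_FORCES) != 0);
flags->computeNonbondedForces = ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && isNonbondedOn;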
/*! \brief Launch end-of-step GPU tasks: buffer clearing and rolling pruning.
 *
 * TODO: eliminate the \p useGpuNonbonded and \p useGpuPme parameters when these are
- * incorporated in PpForceWorkload.
+ * incorporated in DomainLifetimeWorkload.
*/
static void
-launchGpuEndOfStepTasks(nonbonded_verlet_t *nbv,
- gmx::GpuBonded *gpuBonded,
- gmx_pme_t *pmedata,
- gmx_enerdata_t *enerd,
- const gmx::MdScheduleWorkload &mdScheduleWork,
- bool useGpuNonbonded,
- bool useGpuPme,
- int64_t step,
- gmx_wallcycle_t wcycle)
+launchGpuEndOfStepTasks(nonbonded_verlet_t *nbv,
+ gmx::GpuBonded *gpuBonded,
+ gmx_pme_t *pmedata,
+ gmx_enerdata_t *enerd,
+ const gmx::MdrunScheduleWorkload &runScheduleWork,
+ bool useGpuNonbonded,
+ bool useGpuPme,
+ int64_t step,
+ gmx_wallcycle_t wcycle)
{
if (useGpuNonbonded)
{
/* now clear the GPU outputs while we finish the step on the CPU */
wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, mdScheduleWork.forceFlags.computeVirial);
+ Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
pme_gpu_reinit_computation(pmedata, wcycle);
}
- if (mdScheduleWork.forceWork.haveGpuBondedWork && mdScheduleWork.forceFlags.computeEnergy)
+ if (runScheduleWork.domainWork.haveGpuBondedWork && runScheduleWork.stepWork.computeEnergy)
{
// in principle this should be included in the DD balancing region,
// but generally it is infrequent so we'll omit it for the sake of
gmx::ArrayRef<real> lambda,
t_graph *graph,
t_forcerec *fr,
- gmx::MdScheduleWorkload *mdScheduleWork,
+ gmx::MdrunScheduleWorkload *runScheduleWork,
const gmx_vsite_t *vsite,
rvec mu_tot,
double t,
{
legacyFlags &= ~GMX_FORCE_NONBONDED;
}
- setupForceFlags(&mdScheduleWork->forceFlags, legacyFlags, fr->bNonbonded);
+ setupStepWorkload(&runScheduleWork->stepWork, legacyFlags, fr->bNonbonded);
- const gmx::ForceFlags &forceFlags = mdScheduleWork->forceFlags;
+ const gmx::StepWorkload &stepWork = runScheduleWork->stepWork;
- bFillGrid = (forceFlags.doNeighborSearch && forceFlags.stateChanged);
+ bFillGrid = (stepWork.doNeighborSearch && stepWork.stateChanged);
bCalcCGCM = (bFillGrid && !DOMAINDECOMP(cr));
bUseGPU = fr->nbv->useGpu();
bUseOrEmulGPU = bUseGPU || fr->nbv->emulateGpu();
const bool useGpuPme = EEL_PME(fr->ic->eeltype) && thisRankHasDuty(cr, DUTY_PME) &&
((pmeRunMode == PmeRunMode::GPU) || (pmeRunMode == PmeRunMode::Mixed));
const int pmeFlags = GMX_PME_SPREAD | GMX_PME_SOLVE |
- (forceFlags.computeVirial ? GMX_PME_CALC_ENER_VIR : 0) |
- (forceFlags.computeEnergy ? GMX_PME_CALC_ENER_VIR : 0) |
- (forceFlags.computeForces ? GMX_PME_CALC_F : 0);
+ (stepWork.computeVirial ? GMX_PME_CALC_ENER_VIR : 0) |
+ (stepWork.computeEnergy ? GMX_PME_CALC_ENER_VIR : 0) |
+ (stepWork.computeForces ? GMX_PME_CALC_F : 0);
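// For example, on a step that needs forces and energies but no virial this
// composes to GMX_PME_SPREAD | GMX_PME_SOLVE | GMX_PME_CALC_ENER_VIR |
// GMX_PME_CALC_F. Note that computeVirial and computeEnergy both map onto
// GMX_PME_CALC_ENER_VIR, so either one alone requests the combined
// energy/virial reduction.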
// Switches on whether to use GPU for position and force buffer operations
// TODO consider all possible combinations of triggers, and how to combine optimally in each case.
BufferOpsUseGpu::True : BufferOpsUseGpu::False;
// GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
const BufferOpsUseGpu useGpuFBufOps = (c_enableGpuBufOps && bUseGPU && (GMX_GPU == GMX_GPU_CUDA))
- && !(forceFlags.computeVirial || forceFlags.computeEnergy) ?
+ && !(stepWork.computeVirial || stepWork.computeEnergy) ?
BufferOpsUseGpu::True : BufferOpsUseGpu::False;
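// Net effect of the triggers above: force buffer ops run on the GPU only in
// CUDA builds, only when the nonbonded work itself is on the GPU, and only on
// steps that need neither energies nor the virial.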
// TODO: move / add this flag to the internal PME GPU data structures
const bool useGpuPmeFReduction = (useGpuFBufOps == BufferOpsUseGpu::True) &&
* somewhere early inside the step after communication during domain
* decomposition (and not during the previous step as usual).
*/
- if (forceFlags.doNeighborSearch)
+ if (stepWork.doNeighborSearch)
{
ddBalanceRegionHandler.openBeforeForceComputationCpu(DdAllowBalanceRegionReopen::yes);
}
clear_mat(vir_force);
- if (forceFlags.stateChanged)
+ if (stepWork.stateChanged)
{
if (inputrecNeedMutot(inputrec))
{
/* Compute shift vectors every step,
* because of pressure coupling or box deformation!
*/
- if (forceFlags.haveDynamicBox && forceFlags.stateChanged)
+ if (stepWork.haveDynamicBox && stepWork.stateChanged)
{
calc_shifts(box, fr->shift_vec);
}
}
}
- nbnxn_atomdata_copy_shiftvec(forceFlags.haveDynamicBox,
+ nbnxn_atomdata_copy_shiftvec(stepWork.haveDynamicBox,
fr->shift_vec, nbv->nbat.get());
#if GMX_MPI
/* Send particle coordinates to the PME nodes */
gmx_pme_send_coordinates(cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
lambda[efptCOUL], lambda[efptVDW],
- (forceFlags.computeVirial || forceFlags.computeEnergy),
+ (stepWork.computeVirial || stepWork.computeEnergy),
step, wcycle);
}
#endif /* GMX_MPI */
if (useGpuPme)
{
- launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), forceFlags, pmeFlags, useGpuPmeFReduction, wcycle);
+ launchPmeGpuSpread(fr->pmedata, box, as_rvec_array(x.unpaddedArrayRef().data()), stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
}
/* do gridding for pair search */
- if (forceFlags.doNeighborSearch)
+ if (stepWork.doNeighborSearch)
{
- if (graph && forceFlags.stateChanged)
+ if (graph && stepWork.stateChanged)
{
/* Calculate intramolecular shift vectors to make molecules whole */
mk_mshift(fplog, graph, fr->ePBC, box, as_rvec_array(x.unpaddedArrayRef().data()));
}
}
- // Call it per-step as force-flags can change.
- // Need to run after the GPU-offload bonded interaction lists
- // are set up to be able to determine whether there is bonded work.
- setupForceWorkload(&mdScheduleWork->forceWork,
- inputrec,
- fr,
- pull_work,
- ed,
- top->idef,
- fcd,
- forceFlags);
+ if (stepWork.doNeighborSearch)
+ {
+ // Need to run after the GPU-offload bonded interaction lists
+ // are set up to be able to determine whether there is bonded work.
+ setupDomainLifetimeWorkload(&runScheduleWork->domainWork,
+ inputrec,
+ fr,
+ pull_work,
+ ed,
+ top->idef,
+ fcd,
+ stepWork);
+ }
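// Unlike stepWork, domainWork is recomputed only here, at neighbour-search
// steps: its flags have the lifetime of the domain and can only change when
// the domain composition does.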
- const gmx::PpForceWorkload &forceWork = mdScheduleWork->forceWork;
+ const gmx::DomainLifetimeWorkload &domainWork = runScheduleWork->domainWork;
/* do local pair search */
- if (forceFlags.doNeighborSearch)
+ if (stepWork.doNeighborSearch)
{
- // TODO: fuse this branch with the above forceFlags.doNeighborSearch block
+ // TODO: fuse this branch with the above stepWork.doNeighborSearch block
wallcycle_start_nocount(wcycle, ewcNS);
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
- if (forceFlags.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+ if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
{
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
Nbnxm::AtomLocality::Local);
// bonded work is not split into separate local and non-local, so with DD
// we can only launch the kernel after non-local coordinates have been received.
- if (forceWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
+ if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
{
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
- fr->gpuBonded->launchKernel(fr, forceFlags, box);
+ fr->gpuBonded->launchKernel(fr, stepWork, box);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
}
/* launch local nonbonded work on GPU */
wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::Local, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
/* do non-local pair search */
if (havePPDomainDecomposition(cr))
{
- if (forceFlags.doNeighborSearch)
+ if (stepWork.doNeighborSearch)
{
- // TODO: fuse this branch with the above large forceFlags.doNeighborSearch block
+ // TODO: fuse this branch with the above large stepWork.doNeighborSearch block
wallcycle_start_nocount(wcycle, ewcNS);
wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
/* Note that with a GPU the launch overhead of the list transfer is not timed separately */
gpuHaloExchange->communicateHaloCoordinates(box);
// TODO: DomainLifetimeWorkload should include haveFreeEnergyWork for this domain
- if (forceWork.haveCpuBondedWork || (fr->efep != efepNO))
+ if (domainWork.haveCpuBondedWork || (fr->efep != efepNO))
{
// the non-local part of the coordinate buffer must be copied back to the host for CPU work
nbv->launch_copy_x_from_gpu(as_rvec_array(x.unpaddedArrayRef().data()), Nbnxm::AtomLocality::NonLocal);
{
wallcycle_start(wcycle, ewcLAUNCH_GPU);
- if (forceFlags.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
+ if (stepWork.doNeighborSearch || (useGpuXBufOps == BufferOpsUseGpu::False))
{
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(),
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
}
- if (forceWork.haveGpuBondedWork)
+ if (domainWork.haveGpuBondedWork)
{
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
- fr->gpuBonded->launchKernel(fr, forceFlags, box);
+ fr->gpuBonded->launchKernel(fr, stepWork, box);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
}
/* launch non-local nonbonded tasks on GPU */
wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
if (havePPDomainDecomposition(cr))
{
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- forceFlags, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
+ stepWork, Nbnxm::AtomLocality::NonLocal, copyBackNbForce);
}
Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(),
- forceFlags, Nbnxm::AtomLocality::Local, copyBackNbForce);
+ stepWork, Nbnxm::AtomLocality::Local, copyBackNbForce);
wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
- if (forceWork.haveGpuBondedWork && forceFlags.computeEnergy)
+ if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
{
fr->gpuBonded->launchEnergyTransfer();
}
wallcycle_stop(wcycle, ewcLAUNCH_GPU);
}
- if (forceFlags.stateChanged && inputrecNeedMutot(inputrec))
+ if (stepWork.stateChanged && inputrecNeedMutot(inputrec))
{
if (PAR(cr))
{
if (inputrec->bRot)
{
wallcycle_start(wcycle, ewcROT);
- do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, forceFlags.doNeighborSearch);
+ do_rotation(cr, enforcedRotation, box, as_rvec_array(x.unpaddedArrayRef().data()), t, step, stepWork.doNeighborSearch);
wallcycle_stop(wcycle, ewcROT);
}
// Set up and clear force outputs.
// We use std::move to keep the compiler happy; it has no effect.
- ForceOutputs forceOut = setupForceOutputs(fr, pull_work, *inputrec, std::move(force), forceFlags, wcycle);
+ ForceOutputs forceOut = setupForceOutputs(fr, pull_work, *inputrec, std::move(force), stepWork, wcycle);
/* We calculate the non-bonded forces, when done on the CPU, here.
* We do this before calling do_force_lowlevel, because in that
if (!bUseOrEmulGPU)
{
- do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::Local, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::Local, enbvClearFYes,
step, nrnb, wcycle);
}
nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::Local,
fr, as_rvec_array(x.unpaddedArrayRef().data()), &forceOut.forceWithShiftForces(), *mdatoms,
inputrec->fepvals, lambda.data(),
- enerd, forceFlags, nrnb);
+ enerd, stepWork, nrnb);
if (havePPDomainDecomposition(cr))
{
nbv->dispatchFreeEnergyKernel(Nbnxm::InteractionLocality::NonLocal,
fr, as_rvec_array(x.unpaddedArrayRef().data()), &forceOut.forceWithShiftForces(), *mdatoms,
inputrec->fepvals, lambda.data(),
- enerd, forceFlags, nrnb);
+ enerd, stepWork, nrnb);
}
}
{
if (havePPDomainDecomposition(cr))
{
- do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
+ do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::NonLocal, enbvClearFNo,
step, nrnb, wcycle);
}
- if (forceFlags.computeForces)
+ if (stepWork.computeForces)
{
/* Add all the non-bonded force to the normal force array.
* This can be split into a local and a non-local part when overlapping
}
/* If there are multiple fshift output buffers we need to reduce them */
- if (forceFlags.computeVirial)
+ if (stepWork.computeVirial)
{
/* This is not in a subcounter because it takes a
negligible and constant-sized amount of time */
// TODO: DomainLifetimeWorkload should include haveFreeEnergyWork for this domain
if (ddUsesGpuDirectCommunication &&
- (forceWork.haveCpuBondedWork || (fr->efep != efepNO)))
+ (domainWork.haveCpuBondedWork || (fr->efep != efepNO)))
{
/* Wait for non-local coordinate data to be copied from device */
nbv->wait_nonlocal_x_copy_D2H_done();
cr, ms, nrnb, wcycle, mdatoms,
x, hist, &forceOut, enerd, fcd,
box, lambda.data(), graph, fr->mu_tot,
- forceFlags,
+ stepWork,
ddBalanceRegionHandler);
wallcycle_stop(wcycle, ewcFORCE);
computeSpecialForces(fplog, cr, inputrec, awh, enforcedRotation,
imdSession, pull_work, step, t, wcycle,
fr->forceProviders, box, x.unpaddedArrayRef(), mdatoms, lambda.data(),
- forceFlags, &forceOut.forceWithVirial(), enerd,
- ed, forceFlags.doNeighborSearch);
+ stepWork, &forceOut.forceWithVirial(), enerd,
+ ed, stepWork.doNeighborSearch);
+
// Will store the number of cycles spent waiting for the GPU,
// which is later used in the DLB accounting.
if (bUseGPU)
{
cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
- forceFlags, Nbnxm::AtomLocality::NonLocal,
+ stepWork, Nbnxm::AtomLocality::NonLocal,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
forceWithShiftForces.shiftForces(),
else
{
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
+ do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::NonLocal, enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
}
// TODO: move this into DomainLifetimeWorkload, including the second part of the condition
// The bonded and free energy CPU tasks can have non-local force contributions
// which are a dependency for the GPU force reduction.
- bool haveNonLocalForceContribInCpuBuffer = forceWork.haveCpuBondedWork || (fr->efep != efepNO);
+ bool haveNonLocalForceContribInCpuBuffer = domainWork.haveCpuBondedWork || (fr->efep != efepNO);
rvec *f = as_rvec_array(forceWithShiftForces.force().data());
if (haveNonLocalForceContribInCpuBuffer)
}
- if (fr->nbv->emulateGpu() && forceFlags.computeVirial)
+ if (fr->nbv->emulateGpu() && stepWork.computeVirial)
{
nbnxn_atomdata_add_nbat_fshift_to_fshift(*nbv->nbat,
forceWithShiftForces.shiftForces());
const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
const bool useCpuPmeFReduction = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
// TODO: move this into DomainLifetimeWorkload, including the second part of the condition
- const bool haveCpuLocalForces = (forceWork.haveSpecialForces || forceWork.haveCpuListedForceWork || useCpuPmeFReduction ||
+ const bool haveCpuLocalForces = (domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || useCpuPmeFReduction ||
(fr->efep != efepNO));
if (havePPDomainDecomposition(cr))
ddBalanceRegionHandler.closeAfterForceComputationCpu();
- if (forceFlags.computeForces)
+ if (stepWork.computeForces)
{
gmx::ArrayRef<gmx::RVec> force = forceOut.forceWithShiftForces().force();
rvec *f = as_rvec_array(force.data());
if (alternateGpuWait)
{
alternatePmeNbGpuWaitReduce(fr->nbv.get(), fr->pmedata, &forceOut, enerd,
- forceFlags, pmeFlags, wcycle);
+ stepWork, pmeFlags, wcycle);
}
if (!alternateGpuWait && useGpuPme)
const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
const float waitCycles =
Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
- forceFlags, Nbnxm::AtomLocality::Local,
+ stepWork, Nbnxm::AtomLocality::Local,
enerd->grpp.ener[egLJSR].data(),
enerd->grpp.ener[egCOULSR].data(),
forceOut.forceWithShiftForces().shiftForces(),
if (ddBalanceRegionHandler.useBalancingRegion())
{
DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
- if (forceFlags.computeForces && waitCycles <= gpuWaitApiOverheadMargin)
+ if (stepWork.computeForces && waitCycles <= gpuWaitApiOverheadMargin)
{
/* We measured few cycles, it could be that the kernel
* and transfer finished earlier and there was no actual
// NOTE: emulation kernel is not included in the balancing region,
// but emulation mode does not target performance anyway
wallcycle_start_nocount(wcycle, ewcFORCE);
- do_nb_verlet(fr, ic, enerd, forceFlags, Nbnxm::InteractionLocality::Local,
+ do_nb_verlet(fr, ic, enerd, stepWork, Nbnxm::InteractionLocality::Local,
DOMAINDECOMP(cr) ? enbvClearFNo : enbvClearFYes,
step, nrnb, wcycle);
wallcycle_stop(wcycle, ewcFORCE);
}
launchGpuEndOfStepTasks(nbv, fr->gpuBonded, fr->pmedata, enerd,
- *mdScheduleWork,
+ *runScheduleWork,
bUseGPU, useGpuPme,
step,
wcycle);
dd_force_flop_stop(cr->dd, nrnb);
}
- if (forceFlags.computeForces)
+ if (stepWork.computeForces)
{
rvec *f = as_rvec_array(forceOut.forceWithShiftForces().force().data());
/* If we have NoVirSum forces, but we do not calculate the virial,
* we sum fr->f_novirsum=forceOut.f later.
*/
- if (vsite && !(fr->haveDirectVirialContributions && !forceFlags.computeVirial))
+ if (vsite && !(fr->haveDirectVirialContributions && !stepWork.computeVirial))
{
rvec *fshift = as_rvec_array(forceOut.forceWithShiftForces().shiftForces().data());
spread_vsite_f(vsite, as_rvec_array(x.unpaddedArrayRef().data()), f, fshift, FALSE, nullptr, nrnb,
&top->idef, fr->ePBC, fr->bMolPBC, graph, box, cr, wcycle);
}
- if (forceFlags.computeVirial)
+ if (stepWork.computeVirial)
{
/* Calculation of the virial must be done after vsites! */
calc_virial(0, mdatoms->homenr, as_rvec_array(x.unpaddedArrayRef().data()),
pme_receive_force_ener(cr, &forceOut.forceWithVirial(), enerd, wcycle);
}
- if (forceFlags.computeForces)
+ if (stepWork.computeForces)
{
post_process_forces(cr, step, nrnb, wcycle,
top, box, as_rvec_array(x.unpaddedArrayRef().data()), &forceOut,
vir_force, mdatoms, graph, fr, vsite,
- forceFlags);
+ stepWork);
}
- if (forceFlags.computeEnergy)
+ if (stepWork.computeEnergy)
{
/* Sum the potential energy terms from group contributions */
sum_epot(&(enerd->grpp), enerd->term);