top_global,
constr ? constr->numFlexibleConstraints() : 0,
ir->nstcalcenergy,
- DOMAINDECOMP(cr),
+ haveDDAtomOrdering(*cr),
useGpuForPme);
{
? PinningPolicy::PinnedIfSupported
: PinningPolicy::CannotBePinned);
const t_mdatoms* md = mdAtoms->mdatoms();
- if (DOMAINDECOMP(cr))
+ if (haveDDAtomOrdering(*cr))
{
// Local state only becomes valid now.
dd_init_local_state(*cr->dd, state_global, state);
// TODO: the assertions below should be handled by UpdateConstraintsBuilder.
if (useGpuForUpdate)
{
- GMX_RELEASE_ASSERT(!DOMAINDECOMP(cr) || ddUsesUpdateGroups(*cr->dd) || constr == nullptr
- || constr->numConstraintsTotal() == 0,
+ GMX_RELEASE_ASSERT(!haveDDAtomOrdering(*cr) || ddUsesUpdateGroups(*cr->dd)
+ || constr == nullptr || constr->numConstraintsTotal() == 0,
"Constraints in domain decomposition are only supported with update "
"groups if using GPU update.\n");
GMX_RELEASE_ASSERT(ir->eConstrAlg != ConstraintAlgorithm::Shake || constr == nullptr
shake_vir,
total_vir,
pres,
- gmx::ArrayRef<real>{},
&nullSignaller,
state->box,
&bSumEkinhOld,
cglo_flags_iteration,
step,
&observablesReducer);
+ // Clean up after pre-step use of compute_globals()
+ observablesReducer.markAsReadyToReduce();
+
if (cglo_flags_iteration & CGLO_STOPCM)
{
/* At initialization, do not pass x with acceleration-correction mode
shake_vir,
total_vir,
pres,
- gmx::ArrayRef<real>{},
&nullSignaller,
state->box,
&bSumEkinhOld,
cglo_flags & ~CGLO_PRESSURE,
step,
&observablesReducer);
+ // Clean up after pre-step use of compute_globals()
+ observablesReducer.markAsReadyToReduce();
}
/* Calculate the initial half step temperature, and save the ekinh_old */
do_verbose = mdrunOptions.verbose
&& (step % mdrunOptions.verboseStepPrintInterval == 0 || bFirstStep || bLastStep);
- if (useGpuForUpdate && !bFirstStep && bNS)
+ // On search steps, when doing the update on the GPU, copy
+ // the coordinates and velocities to the host unless they are
+ // already there (i.e., on the first step and after replica
+ // exchange).
+ if (useGpuForUpdate && bNS && !bFirstStep && !bExchanged)
{
- // Copy velocities from the GPU on search steps to keep a copy on host (device buffers are reinitialized).
stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
- stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
- // Copy coordinate from the GPU when needed at the search step.
- // NOTE: The cases when coordinates needed on CPU for force evaluation are handled in sim_utils.
- // NOTE: If the coordinates are to be written into output file they are also copied separately before the output.
stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
+ stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
}
if (correct_box(fplog, step, state->box))
{
bMasterState = TRUE;
- // If update is offloaded, it should be informed about the box size change
- if (useGpuForUpdate)
- {
- integrator->setPbc(PbcType::Xyz, state->box);
- }
}
}
- if (DOMAINDECOMP(cr) && bMasterState)
+ // If update is offloaded, and the box was changed either
+ // above or in a replica exchange on the previous step,
+ // the GPU Update object should be informed
+ if (useGpuForUpdate && (bMasterState || bExchanged))
+ {
+ integrator->setPbc(PbcType::Xyz, state->box);
+ }
+ if (haveDDAtomOrdering(*cr) && bMasterState)
{
dd_collect_state(cr->dd, state, state_global);
}
- if (DOMAINDECOMP(cr))
+ if (haveDDAtomOrdering(*cr))
{
/* Repartition the domain decomposition */
dd_partition_system(fplog,
nullptr,
nullptr,
nullptr,
- gmx::ArrayRef<real>{},
&nullSignaller,
state->box,
&bSumEkinhOld,
{
if (useGpuForUpdate)
{
- if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
+ // On search steps, update handles to device vectors
+ if (bNS && (bFirstStep || haveDDAtomOrdering(*cr) || bExchanged))
{
integrator->set(stateGpu->getCoordinates(),
stateGpu->getVelocities(),
/* The velocity copy is redundant if we had Center-of-Mass motion removed on
* the previous step. We don't check that now. */
stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
- if (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
- && !runScheduleWork->stepWork.useGpuXBufferOps)
+ if (bExchanged
+ || (!runScheduleWork->stepWork.haveGpuPmeOnThisRank
+ && !runScheduleWork->stepWork.useGpuXBufferOps))
{
stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
}
&& do_per_step(step + ir->nsttcouple - 1, ir->nsttcouple));
// This applies Leap-Frog, LINCS and SETTLE in succession
- integrator->integrate(
- stateGpu->getForcesReadyOnDeviceEvent(
- AtomLocality::Local, runScheduleWork->stepWork.useGpuFBufferOps),
- ir->delta_t,
- true,
- bCalcVir,
- shake_vir,
- doTemperatureScaling,
- ekind->tcstat,
- doParrinelloRahman,
- ir->nstpcouple * ir->delta_t,
- M);
-
- // Copy velocities D2H after update if:
- // - Globals are computed this step (includes the energy output steps).
- // - Temperature is needed for the next step.
- if (bGStat || needHalfStepKineticEnergy)
- {
- stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
- stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
- }
+ integrator->integrate(stateGpu->getLocalForcesReadyOnDeviceEvent(
+ runScheduleWork->stepWork, runScheduleWork->simulationWork),
+ ir->delta_t,
+ true,
+ bCalcVir,
+ shake_vir,
+ doTemperatureScaling,
+ ekind->tcstat,
+ doParrinelloRahman,
+ ir->nstpcouple * ir->delta_t,
+ M);
}
else
{
if (ir->bPull && ir->pull->bSetPbcRefToPrevStepCOM)
{
- updatePrevStepPullCom(pull_work, state);
+ updatePrevStepPullCom(pull_work, state->pull_com_prev_step);
}
enerd->term[F_DVDL_CONSTR] += dvdl_constr;
// and when algorithms require it.
const bool doInterSimSignal = (simulationsShareState && do_per_step(step, nstSignalComm));
- if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
+ if (useGpuForUpdate)
{
- // Copy coordinates when needed to stop the CM motion.
- if (useGpuForUpdate && (bDoReplEx || (!EI_VV(ir->eI) && bStopCM)))
+ const bool coordinatesRequiredForStopCM =
+ bStopCM && (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
+ && !EI_VV(ir->eI);
+
+ // Copy coordinates when needed to stop the CM motion or for replica exchange
+ if (coordinatesRequiredForStopCM || bDoReplEx)
{
stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
}
+
+ // Copy velocities back to the host if:
+ // - Globals are computed this step (includes the energy output steps).
+ // - Temperature is needed for the next step.
+ // - This is a replica exchange step (even though we will only need
+ // the velocities if an exchange succeeds)
+ if (bGStat || needHalfStepKineticEnergy || bDoReplEx)
+ {
+ stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
+ stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
+ }
+ }
+
+ if (bGStat || needHalfStepKineticEnergy || doInterSimSignal)
+ {
// Since we're already communicating at this step, we
// can propagate intra-simulation signals. Note that
// check_nstglobalcomm has the responsibility for
bool doIntraSimSignal = true;
SimulationSignaller signaller(&signals, cr, ms, doInterSimSignal, doIntraSimSignal);
- compute_globals(
- gstat,
- cr,
- ir,
- fr,
- ekind,
- makeConstArrayRef(state->x),
- makeConstArrayRef(state->v),
- state->box,
- md,
- nrnb,
- &vcm,
- wcycle,
- enerd,
- force_vir,
- shake_vir,
- total_vir,
- pres,
- (!EI_VV(ir->eI) && bCalcEner && constr != nullptr) ? constr->rmsdData()
- : gmx::ArrayRef<real>{},
- &signaller,
- lastbox,
- &bSumEkinhOld,
- (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
- | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
- | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
- | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
- step,
- &observablesReducer);
+ compute_globals(gstat,
+ cr,
+ ir,
+ fr,
+ ekind,
+ makeConstArrayRef(state->x),
+ makeConstArrayRef(state->v),
+ state->box,
+ md,
+ nrnb,
+ &vcm,
+ wcycle,
+ enerd,
+ force_vir,
+ shake_vir,
+ total_vir,
+ pres,
+ &signaller,
+ lastbox,
+ &bSumEkinhOld,
+ (bGStat ? CGLO_GSTAT : 0) | (!EI_VV(ir->eI) && bCalcEner ? CGLO_ENERGY : 0)
+ | (!EI_VV(ir->eI) && bStopCM ? CGLO_STOPCM : 0)
+ | (!EI_VV(ir->eI) ? CGLO_TEMPERATURE : 0)
+ | (!EI_VV(ir->eI) ? CGLO_PRESSURE : 0) | CGLO_CONSTRAINT,
+ step,
+ &observablesReducer);
if (!EI_VV(ir->eI) && bStopCM)
{
process_and_stopcm_grp(
// TODO: The special case of removing CM motion should be dealt more gracefully
if (useGpuForUpdate)
{
+ // Issue #3988, #4106.
+ stateGpu->resetCoordinatesCopiedToDeviceEvent(AtomLocality::Local);
stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
// Here we block until the H2D copy completes because event sync with the
// force kernels that use the coordinates on the next steps is not implemented
accumulateKineticLambdaComponents(enerd, state->lambda, *ir->fepvals);
}
+ bool scaleCoordinates = !useGpuForUpdate || bDoReplEx;
update_pcouple_after_coordinates(fplog,
step,
ir,
state,
nrnb,
upd.deform(),
- !useGpuForUpdate);
+ scaleCoordinates);
const bool doBerendsenPressureCoupling = (inputrec->epc == PressureCoupling::Berendsen
&& do_per_step(step, inputrec->nstpcouple));
md->tmass,
enerd,
ir->fepvals.get(),
- ir->expandedvals.get(),
lastbox,
PTCouplingArrays{ state->boxv,
state->nosehoover_xi,
MASTER(cr) && mdrunOptions.verbose,
bRerunMD);
- if (bNeedRepartition && DOMAINDECOMP(cr))
+ if (bNeedRepartition && haveDDAtomOrdering(*cr))
{
dd_collect_state(cr->dd, state, state_global);
}
bExchanged = replica_exchange(fplog, cr, ms, repl_ex, state_global, enerd, state, step, t);
}
- if ((bExchanged || bNeedRepartition) && DOMAINDECOMP(cr))
+ if ((bExchanged || bNeedRepartition) && haveDDAtomOrdering(*cr))
{
dd_partition_system(fplog,
mdlog,
}
cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
- if (DOMAINDECOMP(cr) && wcycle)
+ if (haveDDAtomOrdering(*cr) && wcycle)
{
dd_cycles_add(cr->dd, cycles, ddCyclStep);
}
/* increase the MD step number */
step++;
step_rel++;
+ observablesReducer.markAsReadyToReduce();
#if GMX_FAHCORE
if (MASTER(cr))