#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/mdtypes/mdrunoptions.h"
#include "gromacs/mdtypes/observableshistory.h"
+#include "gromacs/mdtypes/observablesreducer.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
{
DevelopmentFeatureFlags devFlags;
- devFlags.enableGpuBufferOps =
- GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+ devFlags.enableGpuBufferOps = (GMX_GPU_CUDA || GMX_GPU_SYCL) && useGpuForNonbonded
+ && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
devFlags.enableGpuHaloExchange = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
devFlags.enableGpuPmePPComm = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
print_flop(fplog, nrnb_tot, &nbfs, &mflop);
}
- if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
+ if (thisRankHasDuty(cr, DUTY_PP) && haveDDAtomOrdering(*cr))
{
print_dd_statistics(cr, inputrec, fplog);
}
hw_opt.nthreads_tmpi);
useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(useGpuForNonbonded,
pmeTarget,
+ pmeFftTarget,
numAvailableDevices,
userGpuTaskAssignment,
*hwinfo_,
// master and spawned threads join at the end of this block.
}
- GMX_RELEASE_ASSERT(ms || simulationCommunicator != MPI_COMM_NULL,
+ GMX_RELEASE_ASSERT(!GMX_MPI || ms || simulationCommunicator != MPI_COMM_NULL,
"Must have valid communicator unless running a multi-simulation");
CommrecHandle crHandle = init_commrec(simulationCommunicator);
t_commrec* cr = crHandle.get();
GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
partialDeserializedTpr.reset(nullptr);
- GMX_RELEASE_ASSERT(
- !inputrec->useConstantAcceleration,
- "Linear acceleration has been removed in GROMACS 2022, and was broken for many years "
- "before that. Use GROMACS 4.5 or earlier if you need this feature.");
-
// Now the number of ranks is known to all ranks, and each knows
// the inputrec read by the master rank. The ranks can now all run
// the task-deciding functions and will agree on the result
// without needing to communicate.
+ // The LBFGS minimizer, test-particle insertion, normal modes and shell dynamics don't support DD
const bool useDomainDecomposition =
- (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == IntegrationAlgorithm::NM));
+ !(inputrec->eI == IntegrationAlgorithm::LBFGS || EI_TPI(inputrec->eI)
+ || inputrec->eI == IntegrationAlgorithm::NM
+ || gmx_mtop_particletype_count(mtop)[ParticleType::Shell] > 0);
// Note that these variables describe only their own node.
//
gpusWereDetected);
useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded,
pmeTarget,
+ pmeFftTarget,
userGpuTaskAssignment,
*hwinfo_,
*inputrec,
doEssentialDynamics,
membedHolder.doMembed());
+ ObservablesReducerBuilder observablesReducerBuilder;
+
// Build restraints.
// TODO: hide restraint implementation details from Mdrunner.
// There is nothing unique about restraints at this point as far as the
globalState = std::make_unique<t_state>();
}
broadcastStateWithoutDynamics(
- cr->mpiDefaultCommunicator, DOMAINDECOMP(cr), PAR(cr), globalState.get());
+ cr->mpiDefaultCommunicator, haveDDAtomOrdering(*cr), PAR(cr), globalState.get());
}
/* A parallel command line option consistency check that we can
globalState.get(),
replExParams.exchangeInterval > 0);
- std::unique_ptr<t_oriresdata> oriresData;
- if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0 && isSimulationMasterRank)
{
- oriresData = std::make_unique<t_oriresdata>(fplog, mtop, *inputrec, cr, ms, globalState.get());
+ extendStateWithOriresHistory(mtop, *inputrec, globalState.get());
}
auto deform = prepareBoxDeformation(globalState != nullptr ? globalState->box : box,
systemHasConstraintsOrVsites(mtop),
cutoffMargin);
+ try
+ {
+ const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
+
+ useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
+ updateGroups.useUpdateGroups(),
+ pmeRunMode,
+ domdecOptions.numPmeRanks > 0,
+ useGpuForNonbonded,
+ updateTarget,
+ gpusWereDetected,
+ *inputrec,
+ mtop,
+ doEssentialDynamics,
+ gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+ haveFrozenAtoms,
+ doRerun,
+ devFlags,
+ mdlog);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
+
+ bool useGpuDirectHalo = false;
+
+ if (useGpuForNonbonded)
+ {
+        // cr->npmenodes is not yet initialized.
+        // domdecOptions.numPmeRanks == -1 results in 0 separate PME ranks when useGpuForNonbonded is true.
+        // TODO: remove this assumption once auto mode supports separate PME ranks.
+ const int numPmeRanks = domdecOptions.numPmeRanks > 0 ? domdecOptions.numPmeRanks : 0;
+ bool havePPDomainDecomposition = (cr->sizeOfDefaultCommunicator - numPmeRanks) > 1;
+ useGpuDirectHalo = decideWhetherToUseGpuForHalo(devFlags,
+ havePPDomainDecomposition,
+ useGpuForNonbonded,
+ useModularSimulator,
+ doRerun,
+ EI_ENERGY_MINIMIZATION(inputrec->eI));
+ }
+
// This builder is necessary while we have multi-part construction
// of DD. Before DD is constructed, we use the existence of
// the builder object to indicate that further construction of DD
std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
if (useDomainDecomposition)
{
- ddBuilder = std::make_unique<DomainDecompositionBuilder>(
+ // P2P GPU comm + GPU update leads to case in which we enqueue async work for multiple
+ // timesteps. DLB needs to be disabled in that case
+ const bool directGpuCommUsedWithGpuUpdate = GMX_THREAD_MPI && useGpuDirectHalo && useGpuForUpdate;
+ ddBuilder = std::make_unique<DomainDecompositionBuilder>(
mdlog,
cr,
domdecOptions,
updateGroups.maxUpdateGroupRadius(),
positionsFromStatePointer(globalState.get()),
useGpuForNonbonded,
- useGpuForPme);
+ useGpuForPme,
+ directGpuCommUsedWithGpuUpdate);
}
else
{
// requires it (e.g. pull, CompEl, density fitting), so that we
// don't update the local atom sets unilaterally every step.
LocalAtomSetManager atomSets;
+
+ // Local state and topology are declared (and perhaps constructed)
+ // now, because DD needs them for the LocalTopologyChecker, but
+ // they do not contain valid data until after the first DD
+ // partition.
+ std::unique_ptr<t_state> localStateInstance;
+ t_state* localState;
+ gmx_localtop_t localTopology(mtop.ffparams);
+
if (ddBuilder)
{
+ localStateInstance = std::make_unique<t_state>();
+ localState = localStateInstance.get();
// TODO Pass the GPU streams to ddBuilder to use in buffer
// transfers (e.g. halo exchange)
- cr->dd = ddBuilder->build(&atomSets);
+ cr->dd = ddBuilder->build(&atomSets, localTopology, *localState, &observablesReducerBuilder);
// The builder's job is done, so destruct it
ddBuilder.reset(nullptr);
// Note that local state still does not exist yet.
}
-
- // The GPU update is decided here because we need to know whether the constraints or
- // SETTLEs can span across the domain borders (i.e. whether or not update groups are
- // defined). This is only known after DD is initialized, hence decision on using GPU
- // update is done so late.
- try
+ else
{
- const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
+ // Without DD, the local state is merely an alias to the global state,
+ // so we don't need to allocate anything.
+ localState = globalState.get();
+ }
- useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
- updateGroups.useUpdateGroups(),
- pmeRunMode,
- domdecOptions.numPmeRanks > 0,
- useGpuForNonbonded,
- updateTarget,
- gpusWereDetected,
- *inputrec,
- mtop,
- doEssentialDynamics,
- gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
- haveFrozenAtoms,
- doRerun,
- devFlags,
- mdlog);
+ // Ensure that all atoms within the same update group are in the
+ // same periodic image. Otherwise, a simulation that did not use
+ // update groups (e.g. a single-rank simulation) cannot always be
+ // correctly restarted in a way that does use update groups
+ // (e.g. a multi-rank simulation).
+ if (isSimulationMasterRank)
+ {
+ const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
+ if (useUpdateGroups)
+ {
+ putUpdateGroupAtomsInSamePeriodicImage(*cr->dd, mtop, globalState->box, globalState->x);
+ }
}
- GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
const bool printHostName = (cr->nnodes > 1);
gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
MdrunScheduleWorkload runScheduleWork;
- bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(devFlags,
- havePPDomainDecomposition(cr),
- useGpuForNonbonded,
- useModularSimulator,
- doRerun,
- EI_ENERGY_MINIMIZATION(inputrec->eI));
-
// Also populates the simulation constant workload description.
+ // Note: currently the default duty is DUTY_PP | DUTY_PME for all simulations, including those without PME,
+ // so this boolean is sufficient on all ranks to determine whether separate PME ranks are used,
+ // but this will no longer be the case if cr->duty is changed for !EEL_PME(fr->ic->eeltype).
+ const bool haveSeparatePmeRank = (!thisRankHasDuty(cr, DUTY_PP) || !thisRankHasDuty(cr, DUTY_PME));
runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
disableNonbondedCalculation,
devFlags,
havePPDomainDecomposition(cr),
+ haveSeparatePmeRank,
useGpuForNonbonded,
pmeRunMode,
useGpuForBonded,
if (deviceInfo != nullptr)
{
- if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
+ if (runScheduleWork.simulationWork.havePpDomainDecomposition && thisRankHasDuty(cr, DUTY_PP))
{
dd_setup_dlb_resource_sharing(cr, deviceId);
}
// Enable Peer access between GPUs where available
// Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
// any of the GPU communication features are active.
- if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
+ if (haveDDAtomOrdering(*cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
&& (runScheduleWork.simulationWork.useGpuHaloExchange
|| runScheduleWork.simulationWork.useGpuPmePpCommunication))
{
pforce);
// Dirty hack, for fixing disres and orires should be made mdmodules
fr->fcdata->disres = disresdata;
- fr->fcdata->orires.swap(oriresData);
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
+ {
+ fr->fcdata->orires = std::make_unique<t_oriresdata>(
+ fplog, mtop, *inputrec, ms, globalState.get(), &atomSets);
+ }
// Save a handle to device stream manager to use elsewhere in the code
// TODO: Forcerec is not a correct place to store it.
deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)),
wcycle.get());
}
+ fr->longRangeNonbondeds = std::make_unique<CpuPpLongRangeNonbondeds>(fr->n_tpi,
+ fr->ic->ewaldcoeff_q,
+ fr->ic->epsilon_r,
+ fr->qsum,
+ fr->ic->eeltype,
+ fr->ic->vdwtype,
+ *inputrec,
+ &nrnb,
+ wcycle.get(),
+ fplog);
/* Initialize the mdAtoms structure.
* mdAtoms is not filled with atom data,
}
}
// Make the DD reverse topology, now that any vsites that are present are available
- if (DOMAINDECOMP(cr))
+ if (haveDDAtomOrdering(*cr))
{
dd_make_reverse_top(fplog, cr->dd, mtop, vsite.get(), *inputrec, domdecOptions.ddBondedChecking);
}
ms,
&nrnb,
wcycle.get(),
- fr->bMolPBC);
+ fr->bMolPBC,
+ &observablesReducerBuilder);
/* Energy terms and groups */
gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
mdrunOptions.imdOptions,
startingBehavior);
- if (DOMAINDECOMP(cr))
+ if (haveDDAtomOrdering(*cr))
{
GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
/* This call is not included in init_domain_decomposition
GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
SimulatorBuilder simulatorBuilder;
- simulatorBuilder.add(SimulatorStateData(globalState.get(), &observablesHistory, &enerd, &ekind));
+ simulatorBuilder.add(SimulatorStateData(
+ globalState.get(), localState, &observablesHistory, &enerd, &ekind));
simulatorBuilder.add(std::move(membedHolder));
simulatorBuilder.add(std::move(stopHandlerBuilder_));
simulatorBuilder.add(SimulatorConfig(mdrunOptions, startingBehavior, &runScheduleWork));
- simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv));
+ simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv, &observablesReducerBuilder));
simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle.get()));
simulatorBuilder.add(ConstraintsParam(
constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, vsite.get()));
simulatorBuilder.add(CenterOfMassPulling(pull_work));
// Todo move to an MDModule
simulatorBuilder.add(IonSwapping(swap));
- simulatorBuilder.add(TopologyData(mtop, mdAtoms.get()));
+ simulatorBuilder.add(TopologyData(mtop, &localTopology, mdAtoms.get()));
simulatorBuilder.add(BoxDeformationHandle(deform.get()));
simulatorBuilder.add(std::move(modularSimulatorCheckpointData));
// As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
mdAtoms.reset(nullptr);
globalState.reset(nullptr);
+ localStateInstance.reset(nullptr);
mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
fr.reset(nullptr); // destruct forcerec before gpu
// TODO convert to C++ so we can get rid of these frees