#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/domdec/localatomsetmanager.h"
+#include "gromacs/domdec/makebondedlinks.h"
#include "gromacs/domdec/partition.h"
+#include "gromacs/domdec/reversetopology.h"
#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_gpu_program.h"
#include "gromacs/hardware/printhardware.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/listed_forces/listed_forces_gpu.h"
#include "gromacs/listed_forces/listed_forces.h"
#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/mdtypes/mdrunoptions.h"
#include "gromacs/mdtypes/observableshistory.h"
+#include "gromacs/mdtypes/observablesreducer.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
#include "gromacs/utility/keyvaluetree.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/loggerbuilder.h"
-#include "gromacs/utility/mdmodulenotification.h"
+#include "gromacs/utility/mdmodulesnotifiers.h"
#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/utility/mpiinfo.h"
#include "isimulator.h"
#include "membedholder.h"
{
DevelopmentFeatureFlags devFlags;
- // Some builds of GCC 5 give false positive warnings that these
- // getenv results are ignored when clearly they are used.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-result"
-
devFlags.enableGpuBufferOps =
GMX_GPU_CUDA && useGpuForNonbonded && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
- devFlags.enableGpuHaloExchange = GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_DD_COMMS") != nullptr;
+ devFlags.enableGpuHaloExchange = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
- devFlags.enableGpuPmePPComm =
- GMX_GPU_CUDA && GMX_THREAD_MPI && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
+ devFlags.enableGpuPmePPComm = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
+
+    // The direct GPU communication path is used with CUDA-aware MPI;
+    // make sure the underlying MPI implementation is CUDA-aware.
+ if (!GMX_THREAD_MPI && (devFlags.enableGpuPmePPComm || devFlags.enableGpuHaloExchange))
+ {
+ const bool haveDetectedCudaAwareMpi =
+ (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
+ const bool forceCudaAwareMpi = (getenv("GMX_FORCE_CUDA_AWARE_MPI") != nullptr);
+
+ if (!haveDetectedCudaAwareMpi && forceCudaAwareMpi)
+ {
+            // CUDA-aware support was not detected in the MPI library, but the user has forced its use
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "This run has forced use of 'CUDA-aware MPI'. "
+ "But, GROMACS cannot determine if underlying MPI "
+ "is CUDA-aware. GROMACS recommends use of latest openMPI version "
+ "for CUDA-aware support. "
+ "If you observe failures at runtime, try unsetting "
+ "GMX_FORCE_CUDA_AWARE_MPI environment variable.");
+ }
-#pragma GCC diagnostic pop
+ if (haveDetectedCudaAwareMpi || forceCudaAwareMpi)
+ {
+ devFlags.usingCudaAwareMpi = true;
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "Using CUDA-aware MPI for 'GPU halo exchange' or 'GPU PME-PP "
+ "communications' feature.");
+ }
+ else
+ {
+ if (devFlags.enableGpuHaloExchange)
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
+ "halo exchange' feature will not be enabled as GROMACS couldn't "
+ "detect CUDA_aware support in underlying MPI implementation.");
+ devFlags.enableGpuHaloExchange = false;
+ }
+ if (devFlags.enableGpuPmePPComm)
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText(
+ "GMX_GPU_PME_PP_COMMS environment variable detected, but the "
+ "'GPU PME-PP communications' feature will not be enabled as "
+ "GROMACS couldn't "
+ "detect CUDA_aware support in underlying MPI implementation.");
+ devFlags.enableGpuPmePPComm = false;
+ }
+
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "GROMACS recommends use of latest OpenMPI version for CUDA-aware "
+ "support. "
+ "If you are certain about CUDA-aware support in your MPI library, "
+ "you can force it's use by setting environment variable "
+ " GMX_FORCE_CUDA_AWARE_MPI.");
+ }
+ }
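// Editorial sketch, not part of this patch: with Open MPI, the detection used
// above (checkMpiCudaAwareSupport()) can rest on the <mpi-ext.h> extension
// header, which provides MPIX_Query_cuda_support(). The helper name below is
// hypothetical; MPI libraries without this extension offer no portable way to
// detect CUDA-awareness, which is why the GMX_FORCE_CUDA_AWARE_MPI override
// exists at all.
#if 0
#    include <mpi.h>
#    if defined(OPEN_MPI)
#        include <mpi-ext.h> // defines MPIX_CUDA_AWARE_SUPPORT in CUDA-enabled builds
#    endif

static bool mpiLibraryReportsCudaAwareness()
{
#    if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
    // Open MPI can answer at runtime whether device pointers may be passed to MPI calls.
    return MPIX_Query_cuda_support() == 1;
#    else
    return false; // cannot tell; the user may still force use via the environment
#    endif
}
#endif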
if (devFlags.enableGpuBufferOps)
{
{
try
{
- auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
+ const auto* masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
/* copy the arg list to make sure that it's thread-local. This
doesn't copy pointed-to items, of course; fnm, cr and fplog
are reset in the call below, all others should be const. */
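// Editorial sketch of the enclosing thread-entry trampoline: with thread-MPI,
// each spawned rank receives the master Mdrunner through a void* and clones it
// thread-locally before running. Simplified sketch (exception handling and the
// exact signatures are assumptions):
#if 0
static void mdrunner_start_fn(const void* arg)
{
    const auto* masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
    // The clone deep-copies the filename list; cr and fplog are reset for the
    // new rank inside mdrunner() itself.
    gmx::Mdrunner mdrunner = masterMdrunner->cloneOnSpawnedThread();
    mdrunner.mdrunner();
}
#endif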
t_commrec* cr,
t_inputrec* ir,
int nstlist_cmdline,
- const gmx_mtop_t* mtop,
+ const gmx_mtop_t& mtop,
const matrix box,
bool makeGpuPairList,
const gmx::CpuInfo& cpuinfo)
{
// We checked the cut-offs in grompp, but double-check here.
// We have PME+LJcutoff kernels for rcoulomb>rvdw.
- if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == eelCUT)
+ if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == VanDerWaalsType::Cut)
{
GMX_RELEASE_ASSERT(ir->rcoulomb >= ir->rvdw,
"With Verlet lists and PME we should have rcoulomb>=rvdw");
VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
const real rlist_new =
- calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
+ calcVerletBufferSize(mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
if (rlist_new != ir->rlist)
{
if (EI_DYNAMICS(ir->eI))
{
/* Set or try nstlist values */
- increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
+ increaseNstlist(fplog, cr, ir, nstlist_cmdline, &mtop, box, makeGpuPairList, cpuinfo);
}
}
static void finish_run(FILE* fplog,
const gmx::MDLogger& mdlog,
const t_commrec* cr,
- const t_inputrec* inputrec,
+ const t_inputrec& inputrec,
t_nrnb nrnb[],
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_walltime_accounting_t walltime_accounting,
nonbonded_verlet_t* nbv,
const gmx_pme_t* pme,
Further, we only report performance for dynamical integrators,
because those are the only ones for which we plan to
consider doing any optimizations. */
- bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
+ bool printReport = EI_DYNAMICS(inputrec.eI) && SIMMASTER(cr);
if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
{
nrnbTotalStorage = std::make_unique<t_nrnb>();
nrnb_tot = nrnbTotalStorage.get();
#if GMX_MPI
- MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+ MPI_Allreduce(nrnb->n.data(), nrnb_tot->n.data(), eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
#endif
}
else
print_flop(fplog, nrnb_tot, &nbfs, &mflop);
}
- if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
+ if (thisRankHasDuty(cr, DUTY_PP) && haveDDAtomOrdering(*cr))
{
print_dd_statistics(cr, inputrec, fplog);
}
* to the code that handled the thread region, so that there's a
* mechanism to keep cycle counting working during the transition
* to task parallelism. */
- int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
- int nthreads_pme = gmx_omp_nthreads_get(emntPME);
+ int nthreads_pp = gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded);
+ int nthreads_pme = gmx_omp_nthreads_get(ModuleMultiThread::Pme);
wallcycle_scale_by_num_threads(
wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
auto cycle_sum(wallcycle_sum(cr, wcycle));
if (printReport)
{
- auto nbnxn_gpu_timings =
+ auto* nbnxn_gpu_timings =
(nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
nbnxn_gpu_timings,
&pme_gpu_timings);
- if (EI_DYNAMICS(inputrec->eI))
+ if (EI_DYNAMICS(inputrec.eI))
{
- delta_t = inputrec->delta_t;
+ delta_t = inputrec.delta_t;
}
if (fplog)
int Mdrunner::mdrunner()
{
- matrix box;
- t_forcerec* fr = nullptr;
- real ewaldcoeff_q = 0;
- real ewaldcoeff_lj = 0;
- int nChargePerturbed = -1, nTypePerturbed = 0;
- gmx_wallcycle_t wcycle;
- gmx_walltime_accounting_t walltime_accounting = nullptr;
- MembedHolder membedHolder(filenames.size(), filenames.data());
+ matrix box;
+ std::unique_ptr<t_forcerec> fr;
+ real ewaldcoeff_q = 0;
+ real ewaldcoeff_lj = 0;
+ int nChargePerturbed = -1, nTypePerturbed = 0;
+ gmx_walltime_accounting_t walltime_accounting = nullptr;
+ MembedHolder membedHolder(filenames.size(), filenames.data());
/* CAUTION: threads may be started later on in this function, so
cr doesn't reflect the final parallel state right now */
gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo_);
- std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo_->deviceInfoList, hw_opt.gpuIdsAvailable);
- const int numDevicesToUse = gmx::ssize(gpuIdsToUse);
+ std::vector<int> availableDevices =
+ makeListOfAvailableDevices(hwinfo_->deviceInfoList, hw_opt.devicesSelectedByUser);
+ const int numAvailableDevices = gmx::ssize(availableDevices);
// Print citation requests after all software/hardware printing
pleaseCiteGromacs(fplog);
auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi(
nonbondedTarget,
- numDevicesToUse,
+ numAvailableDevices > 0,
userGpuTaskAssignment,
emulateGpuNonbonded,
canUseGpuForNonbonded,
hw_opt.nthreads_tmpi);
useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(useGpuForNonbonded,
pmeTarget,
- numDevicesToUse,
+ numAvailableDevices,
userGpuTaskAssignment,
*hwinfo_,
*inputrec,
* correctly. */
hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo_,
&hw_opt,
- numDevicesToUse,
+ numAvailableDevices,
useGpuForNonbonded,
useGpuForPme,
inputrec.get(),
- &mtop,
+ mtop,
mdlog,
membedHolder.doMembed());
// master and spawned threads joins at the end of this block.
}
- GMX_RELEASE_ASSERT(ms || simulationCommunicator != MPI_COMM_NULL,
+ GMX_RELEASE_ASSERT(!GMX_MPI || ms || simulationCommunicator != MPI_COMM_NULL,
"Must have valid communicator unless running a multi-simulation");
CommrecHandle crHandle = init_commrec(simulationCommunicator);
t_commrec* cr = crHandle.get();
"Linear acceleration has been removed in GROMACS 2022, and was broken for many years "
"before that. Use GROMACS 4.5 or earlier if you need this feature.");
+ // Now we decide whether to use the domain decomposition machinery.
+ // Note that this does not necessarily imply actually using multiple domains.
// Now the number of ranks is known to all ranks, and each knows
// the inputrec read by the master rank. The ranks can now all run
// the task-deciding functions and will agree on the result
// without needing to communicate.
- const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
+ // The LBFGS minimizer, test-particle insertion, normal modes and shell dynamics don't support DD
+ const bool useDomainDecomposition =
+ !(inputrec->eI == IntegrationAlgorithm::LBFGS || EI_TPI(inputrec->eI)
+ || inputrec->eI == IntegrationAlgorithm::NM
+ || gmx_mtop_particletype_count(mtop)[ParticleType::Shell] > 0);
// Note that these variables describe only their own node.
//
doEssentialDynamics,
membedHolder.doMembed());
+ ObservablesReducerBuilder observablesReducerBuilder;
+
// Build restraints.
// TODO: hide restraint implementation details from Mdrunner.
// There is nothing unique about restraints at this point as far as the
// TODO: Error handling
mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
- // now that the MdModules know their options, they know which callbacks to sign up to
+ // now that the MDModules know their options, they know which callbacks to sign up to
mdModules_->subscribeToSimulationSetupNotifications();
- const auto& mdModulesNotifier = mdModules_->notifier().simulationSetupNotifications_;
+ const auto& setupNotifier = mdModules_->notifiers().simulationSetupNotifier_;
+
+ // Notify MdModules of existing logger
+ setupNotifier.notify(mdlog);
+ // Notify MdModules of internal parameters, saved into KVT
if (inputrec->internalParameters != nullptr)
{
- mdModulesNotifier.notify(*inputrec->internalParameters);
+ setupNotifier.notify(*inputrec->internalParameters);
+ }
+
+ // Let MdModules know the .tpr filename
+ {
+ gmx::MdRunInputFilename mdRunInputFilename = { ftp2fn(efTPR, filenames.size(), filenames.data()) };
+ setupNotifier.notify(mdRunInputFilename);
}
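// Editorial sketch, all names hypothetical: the setupNotifier.notify() calls
// above follow a type-keyed observer pattern, with one subscriber list per
// notification type and dispatch selected by the argument's static type. A
// minimal stand-alone version:
#if 0
#    include <functional>
#    include <tuple>
#    include <vector>

template<typename... NotificationTypes>
class SimpleNotifier
{
public:
    // Register a callback for notifications of type T.
    template<typename T>
    void subscribe(std::function<void(const T&)> callback)
    {
        std::get<std::vector<std::function<void(const T&)>>>(subscribers_).push_back(std::move(callback));
    }
    // Dispatch an event to every callback registered for its type.
    template<typename T>
    void notify(const T& event) const
    {
        for (const auto& callback : std::get<std::vector<std::function<void(const T&)>>>(subscribers_))
        {
            callback(event);
        }
    }

private:
    std::tuple<std::vector<std::function<void(const NotificationTypes&)>>...> subscribers_;
};

// Usage:
//   SimpleNotifier<gmx::MdRunInputFilename> notifier;
//   notifier.subscribe<gmx::MdRunInputFilename>([](const auto& f) { /* record f */ });
//   notifier.notify(gmx::MdRunInputFilename{ "topol.tpr" });
#endif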
if (fplog != nullptr)
if (SIMMASTER(cr))
{
/* In rerun, set velocities to zero if present */
- if (doRerun && ((globalState->flags & (1 << estV)) != 0))
+ if (doRerun && ((globalState->flags & enumValueToBitMask(StateEntry::V)) != 0))
{
// rerun does not use velocities
GMX_LOG(mdlog.info)
{
clear_rvec(globalState->v[i]);
}
- globalState->flags &= ~(1 << estV);
+ globalState->flags &= ~enumValueToBitMask(StateEntry::V);
}
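// Editorial sketch: enumValueToBitMask() replaces the old "1 << estV" integer
// idiom with a type-safe mapping from a scoped-enum entry to its bit. A
// plausible definition (an assumption for illustration; the real helper lives
// in a GROMACS header, and GROMACS scoped enums end with a Count entry):
#if 0
template<typename Enum>
constexpr int enumValueToBitMask(Enum entry)
{
    static_assert(static_cast<int>(Enum::Count) < 32, "all entries must fit in an int bit mask");
    return 1 << static_cast<int>(entry);
}
#endif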
/* now make sure the state is initialized and propagated */
/* NM and TPI parallelize over force/energy calculations, not atoms,
* so we need to initialize and broadcast the global state.
*/
- if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
+ if (inputrec->eI == IntegrationAlgorithm::NM || inputrec->eI == IntegrationAlgorithm::TPI)
{
if (!MASTER(cr))
{
globalState = std::make_unique<t_state>();
}
broadcastStateWithoutDynamics(
- cr->mpiDefaultCommunicator, DOMAINDECOMP(cr), PAR(cr), globalState.get());
+ cr->mpiDefaultCommunicator, haveDDAtomOrdering(*cr), PAR(cr), globalState.get());
}
/* A parallel command line option consistency check that we can
#endif
}
- if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+ if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || IntegrationAlgorithm::NM == inputrec->eI))
{
gmx_fatal(FARGS,
"The .mdp file specified an energy mininization or normal mode algorithm, and "
"these are not compatible with mdrun -rerun");
}
- /* Object for collecting reasons for not using PME-only ranks */
- SeparatePmeRanksPermitted separatePmeRanksPermitted;
-
- /* Permit MDModules to notify whether they want to use PME-only ranks */
- mdModulesNotifier.notify(&separatePmeRanksPermitted);
-
- /* If simulation is not using PME then disable PME-only ranks */
- if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
- {
- separatePmeRanksPermitted.disablePmeRanks(
- "PME-only ranks are requested, but the system does not use PME "
- "for electrostatics or LJ");
- }
-
- /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
- * improve performance with many threads per GPU, since our OpenMP
- * scaling is bad, but it's difficult to automate the setup.
- */
- if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
- {
- separatePmeRanksPermitted.disablePmeRanks(
- "PME-only CPU ranks are not automatically used when "
- "non-bonded interactions are computed on GPUs");
- }
-
- /* If GPU is used for PME then only 1 PME rank is permitted */
- if (useGpuForPme && (domdecOptions.numPmeRanks < 0 || domdecOptions.numPmeRanks > 1))
- {
- separatePmeRanksPermitted.disablePmeRanks(
- "PME GPU decomposition is not supported. Only one separate PME-only GPU rank "
- "can be used.");
- }
-
- /* Disable PME-only ranks if some parts of the code requested so and it's up to GROMACS to decide */
- if (!separatePmeRanksPermitted.permitSeparatePmeRanks() && domdecOptions.numPmeRanks < 0)
- {
- domdecOptions.numPmeRanks = 0;
- GMX_LOG(mdlog.info)
- .asParagraph()
- .appendText("Simulation will not use PME-only ranks because: "
- + separatePmeRanksPermitted.reasonsWhyDisabled());
- }
-
- /* If some parts of the code could not use PME-only ranks and
- * user explicitly used mdrun -npme option then throw an error */
- if (!separatePmeRanksPermitted.permitSeparatePmeRanks() && domdecOptions.numPmeRanks > 0)
- {
- gmx_fatal_collective(FARGS,
- cr->mpiDefaultCommunicator,
- MASTER(cr),
- "Requested -npme %d option is not viable because: %s",
- domdecOptions.numPmeRanks,
- separatePmeRanksPermitted.reasonsWhyDisabled().c_str());
- }
-
/* NMR restraints must be initialized before load_checkpoint,
* since with time averaging the history is added to t_state.
* For proper consistency check we therefore need to extend
t_disresdata* disresdata;
snew(disresdata, 1);
init_disres(fplog,
- &mtop,
+ mtop,
inputrec.get(),
DisResRunMode::MDRun,
MASTER(cr) ? DDRole::Master : DDRole::Agent,
globalState.get(),
replExParams.exchangeInterval > 0);
- t_oriresdata* oriresdata;
- snew(oriresdata, 1);
- init_orires(fplog, &mtop, inputrec.get(), cr, ms, globalState.get(), oriresdata);
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0 && isSimulationMasterRank)
+ {
+ extendStateWithOriresHistory(mtop, *inputrec, globalState.get());
+ }
auto deform = prepareBoxDeformation(globalState != nullptr ? globalState->box : box,
MASTER(cr) ? DDRole::Master : DDRole::Agent,
globalState.get(),
&observablesHistory,
mdrunOptions.reproducible,
- mdModules_->notifier(),
+ mdModules_->notifiers(),
modularSimulatorCheckpointData.get(),
useModularSimulator);
// TODO: (#3652) Synchronize filesystem state, SimulationInput contents, and program
gmx_bcast(sizeof(box), box, cr->mpiDefaultCommunicator);
}
- if (inputrec->cutoff_scheme != ecutsVERLET)
+ if (inputrec->cutoff_scheme != CutoffScheme::Verlet)
{
gmx_fatal(FARGS,
"This group-scheme .tpr file can no longer be run by mdrun. Please update to the "
cr,
inputrec.get(),
nstlist_cmdline,
- &mtop,
+ mtop,
box,
useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
*hwinfo_->cpuInfo);
+ // We need to decide on update groups early, as this affects
+ // inter-domain communication distances.
+ auto updateGroupingsPerMoleculeType = makeUpdateGroupingsPerMoleculeType(mtop);
+ const real maxUpdateGroupRadius = computeMaxUpdateGroupRadius(
+ mtop, updateGroupingsPerMoleculeType, maxReferenceTemperature(*inputrec));
+ const real cutoffMargin = std::sqrt(max_cutoff2(inputrec->pbcType, box)) - inputrec->rlist;
+ UpdateGroups updateGroups = makeUpdateGroups(mdlog,
+ std::move(updateGroupingsPerMoleculeType),
+ maxUpdateGroupRadius,
+ useDomainDecomposition,
+ systemHasConstraintsOrVsites(mtop),
+ cutoffMargin);
+
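// Editorial note on the margin computed above: max_cutoff2() returns the
// square of the largest cutoff the periodic box supports, so
//     cutoffMargin = sqrt(max_cutoff2(pbcType, box)) - rlist
// is the slack between that box limit and the pair-list cutoff. Update groups
// are only worthwhile when the maximum update-group radius fits within this
// slack (one of the conditions makeUpdateGroups() weighs).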
+ try
+ {
+ const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
+
+ useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
+ updateGroups.useUpdateGroups(),
+ pmeRunMode,
+ domdecOptions.numPmeRanks > 0,
+ useGpuForNonbonded,
+ updateTarget,
+ gpusWereDetected,
+ *inputrec,
+ mtop,
+ doEssentialDynamics,
+ gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+ haveFrozenAtoms,
+ doRerun,
+ devFlags,
+ mdlog);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
+
+ bool useGpuDirectHalo = false;
+
+ if (useGpuForNonbonded)
+ {
+ // cr->npmenodes is not yet initialized.
+ // domdecOptions.numPmeRanks == -1 results in 0 separate PME ranks when useGpuForNonbonded is true.
+        // TODO: remove this assumption once auto mode supports separate PME ranks.
+ const int numPmeRanks = domdecOptions.numPmeRanks > 0 ? domdecOptions.numPmeRanks : 0;
+ bool havePPDomainDecomposition = (cr->sizeOfDefaultCommunicator - numPmeRanks) > 1;
+ useGpuDirectHalo = decideWhetherToUseGpuForHalo(devFlags,
+ havePPDomainDecomposition,
+ useGpuForNonbonded,
+ useModularSimulator,
+ doRerun,
+ EI_ENERGY_MINIMIZATION(inputrec->eI));
+ }
+
// This builder is necessary while we have multi-part construction
// of DD. Before DD is constructed, we use the existence of
// the builder object to indicate that further construction of DD
std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
if (useDomainDecomposition)
{
- ddBuilder = std::make_unique<DomainDecompositionBuilder>(
+        // P2P GPU communication + GPU update leads to a case in which we enqueue
+        // asynchronous work for multiple timesteps; DLB needs to be disabled in that case.
+ const bool directGpuCommUsedWithGpuUpdate = GMX_THREAD_MPI && useGpuDirectHalo && useGpuForUpdate;
+ ddBuilder = std::make_unique<DomainDecompositionBuilder>(
mdlog,
cr,
domdecOptions,
mdrunOptions,
mtop,
*inputrec,
+ mdModules_->notifiers(),
box,
- positionsFromStatePointer(globalState.get()));
+ updateGroups.updateGroupingPerMoleculeType(),
+ updateGroups.useUpdateGroups(),
+ updateGroups.maxUpdateGroupRadius(),
+ positionsFromStatePointer(globalState.get()),
+ useGpuForNonbonded,
+ useGpuForPme,
+ directGpuCommUsedWithGpuUpdate);
}
else
{
// Produce the task assignment for this rank - done after DD is constructed
GpuTaskAssignments gpuTaskAssignments = GpuTaskAssignmentsBuilder::build(
- gpuIdsToUse,
+ availableDevices,
userGpuTaskAssignment,
*hwinfo_,
simulationCommunicator,
// requires it (e.g. pull, CompEl, density fitting), so that we
// don't update the local atom sets unilaterally every step.
LocalAtomSetManager atomSets;
+
+ // Local state and topology are declared (and perhaps constructed)
+ // now, because DD needs them for the LocalTopologyChecker, but
+ // they do not contain valid data until after the first DD
+ // partition.
+ std::unique_ptr<t_state> localStateInstance;
+ t_state* localState;
+ gmx_localtop_t localTopology(mtop.ffparams);
+
if (ddBuilder)
{
+ localStateInstance = std::make_unique<t_state>();
+ localState = localStateInstance.get();
// TODO Pass the GPU streams to ddBuilder to use in buffer
// transfers (e.g. halo exchange)
- cr->dd = ddBuilder->build(&atomSets);
+ cr->dd = ddBuilder->build(&atomSets, localTopology, *localState, &observablesReducerBuilder);
// The builder's job is done, so destruct it
ddBuilder.reset(nullptr);
// Note that local state still does not exist yet.
}
+ else
+ {
+ // Without DD, the local state is merely an alias to the global state,
+ // so we don't need to allocate anything.
+ localState = globalState.get();
+ }
- // The GPU update is decided here because we need to know whether the constraints or
- // SETTLEs can span accross the domain borders (i.e. whether or not update groups are
- // defined). This is only known after DD is initialized, hence decision on using GPU
- // update is done so late.
- try
+ // Ensure that all atoms within the same update group are in the
+ // same periodic image. Otherwise, a simulation that did not use
+ // update groups (e.g. a single-rank simulation) cannot always be
+ // correctly restarted in a way that does use update groups
+ // (e.g. a multi-rank simulation).
+ if (isSimulationMasterRank)
{
const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
-
- useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
- useUpdateGroups,
- pmeRunMode,
- domdecOptions.numPmeRanks > 0,
- useGpuForNonbonded,
- updateTarget,
- gpusWereDetected,
- *inputrec,
- mtop,
- doEssentialDynamics,
- gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
- replExParams.exchangeInterval > 0,
- doRerun,
- devFlags,
- mdlog);
+ if (useUpdateGroups)
+ {
+ putUpdateGroupAtomsInSamePeriodicImage(*cr->dd, mtop, globalState->box, globalState->x);
+ }
}
- GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
const bool printHostName = (cr->nnodes > 1);
gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
MdrunScheduleWorkload runScheduleWork;
- bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(devFlags,
- havePPDomainDecomposition(cr),
- useGpuForNonbonded,
- useModularSimulator,
- doRerun,
- EI_ENERGY_MINIMIZATION(inputrec->eI));
-
// Also populates the simulation constant workload description.
+ // Note: currently the default duty is DUTY_PP | DUTY_PME for all simulations, including those without PME,
+ // so this boolean is sufficient on all ranks to determine whether separate PME ranks are used,
+ // but this will no longer be the case if cr->duty is changed for !EEL_PME(fr->ic->eeltype).
+ const bool haveSeparatePmeRank = (!thisRankHasDuty(cr, DUTY_PP) || !thisRankHasDuty(cr, DUTY_PME));
runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
disableNonbondedCalculation,
devFlags,
+ havePPDomainDecomposition(cr),
+ haveSeparatePmeRank,
useGpuForNonbonded,
pmeRunMode,
useGpuForBonded,
if (deviceInfo != nullptr)
{
- if (DOMAINDECOMP(cr) && thisRankHasDuty(cr, DUTY_PP))
+ if (runScheduleWork.simulationWork.havePpDomainDecomposition && thisRankHasDuty(cr, DUTY_PP))
{
dd_setup_dlb_resource_sharing(cr, deviceId);
}
// where appropriate.
if (!userGpuTaskAssignment.empty())
{
- gpuTaskAssignments.logPerformanceHints(mdlog, numDevicesToUse);
+ gpuTaskAssignments.logPerformanceHints(mdlog, numAvailableDevices);
}
if (PAR(cr))
PME: env variable should be read only on one node to make sure it is
identical everywhere;
*/
- const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded)
- : gmx_omp_nthreads_get(emntPME);
+ const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP)
+ ? gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded)
+ : gmx_omp_nthreads_get(ModuleMultiThread::Pme);
checkHardwareOversubscription(
numThreadsOnThisRank, cr->nodeid, *hwinfo_->hardwareTopology, physicalNodeComm, mdlog);
// Enable Peer access between GPUs where available
// Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
// any of the GPU communication features are active.
- if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
+ if (haveDDAtomOrdering(*cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
&& (runScheduleWork.simulationWork.useGpuHaloExchange
|| runScheduleWork.simulationWork.useGpuPmePpCommunication))
{
- setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
+ setupGpuDevicePeerAccess(gpuTaskAssignments.deviceIdsAssigned(), mdlog);
}
if (hw_opt.threadAffinity != ThreadAffinity::Off)
"The -resetstep functionality is deprecated, and may be removed in a "
"future version.");
}
- wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
+ std::unique_ptr<gmx_wallcycle> wcycle =
+ wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
if (PAR(cr))
{
/* Master synchronizes its value of reset_counters with all nodes
* including PME only nodes */
- int64_t reset_counters = wcycle_get_reset_counters(wcycle);
+ int64_t reset_counters = wcycle_get_reset_counters(wcycle.get());
gmx_bcast(sizeof(reset_counters), &reset_counters, cr->mpi_comm_mysim);
- wcycle_set_reset_counters(wcycle, reset_counters);
+ wcycle_set_reset_counters(wcycle.get(), reset_counters);
}
// Membrane embedding must be initialized before we call init_forcerec()
const bool thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
std::unique_ptr<MDAtoms> mdAtoms;
std::unique_ptr<VirtualSitesHandler> vsite;
- std::unique_ptr<GpuBonded> gpuBonded;
t_nrnb nrnb;
if (thisRankHasDuty(cr, DUTY_PP))
{
- mdModulesNotifier.notify(*cr);
- mdModulesNotifier.notify(&atomSets);
- mdModulesNotifier.notify(mtop);
- mdModulesNotifier.notify(inputrec->pbcType);
- mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
+ setupNotifier.notify(*cr);
+ setupNotifier.notify(&atomSets);
+ setupNotifier.notify(mtop);
+ setupNotifier.notify(inputrec->pbcType);
+ setupNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
/* Initiate forcerecord */
- fr = new t_forcerec;
+ fr = std::make_unique<t_forcerec>();
fr->forceProviders = mdModules_->initForceProviders();
init_forcerec(fplog,
mdlog,
- fr,
- inputrec.get(),
- &mtop,
+ runScheduleWork.simulationWork,
+ fr.get(),
+ *inputrec,
+ mtop,
cr,
box,
opt2fn("-table", filenames.size(), filenames.data()),
pforce);
        // Dirty hack: to fix this properly, disres and orires should be made MDModules
fr->fcdata->disres = disresdata;
- fr->fcdata->orires = oriresdata;
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
+ {
+ fr->fcdata->orires = std::make_unique<t_oriresdata>(
+ fplog, mtop, *inputrec, ms, globalState.get(), &atomSets);
+ }
// Save a handle to device stream manager to use elsewhere in the code
// TODO: Forcerec is not a correct place to store it.
fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
cr->mpi_comm_mysim,
cr->dd->pme_nodeid,
+ &cr->dd->pmeForceReceiveBuffer,
deviceStreamManager->context(),
deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
}
fr->nbv = Nbnxm::init_nb_verlet(mdlog,
- inputrec.get(),
- fr,
+ *inputrec,
+ *fr,
cr,
*hwinfo_,
runScheduleWork.simulationWork.useGpuNonbonded,
deviceStreamManager.get(),
- &mtop,
+ mtop,
box,
- wcycle);
+ wcycle.get());
// TODO: Move the logic below to a GPU bonded builder
if (runScheduleWork.simulationWork.useGpuBonded)
{
GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
"GPU device stream manager should be valid in order to use GPU "
"version of bonded forces.");
- gpuBonded = std::make_unique<GpuBonded>(
+ fr->listedForcesGpu = std::make_unique<ListedForcesGpu>(
mtop.ffparams,
fr->ic->epsfac * fr->fudgeQQ,
deviceStreamManager->context(),
deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)),
- wcycle);
- fr->gpuBonded = gpuBonded.get();
+ wcycle.get());
}
+ fr->longRangeNonbondeds = std::make_unique<CpuPpLongRangeNonbondeds>(fr->n_tpi,
+ fr->ic->ewaldcoeff_q,
+ fr->ic->epsilon_r,
+ fr->qsum,
+ fr->ic->eeltype,
+ fr->ic->vdwtype,
+ *inputrec,
+ &nrnb,
+ wcycle.get(),
+ fplog);
/* Initialize the mdAtoms structure.
* mdAtoms is not filled with atom data,
}
/* Initialize the virtual site communication */
- vsite = makeVirtualSitesHandler(mtop, cr, fr->pbcType);
+ vsite = makeVirtualSitesHandler(
+ mtop, cr, fr->pbcType, updateGroups.updateGroupingPerMoleculeType());
calc_shifts(box, fr->shift_vec);
constructVirtualSitesGlobal(mtop, globalState->x);
}
}
+ // Make the DD reverse topology, now that any vsites that are present are available
+ if (haveDDAtomOrdering(*cr))
+ {
+ dd_make_reverse_top(fplog, cr->dd, mtop, vsite.get(), *inputrec, domdecOptions.ddBondedChecking);
+ }
if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
{
const DeviceContext* deviceContext = runScheduleWork.simulationWork.useGpuPme
? &deviceStreamManager->context()
: nullptr;
- const DeviceStream* pmeStream =
+ const DeviceStream* pmeStream =
runScheduleWork.simulationWork.useGpuPme
- ? &deviceStreamManager->stream(DeviceStreamType::Pme)
- : nullptr;
+ ? &deviceStreamManager->stream(DeviceStreamType::Pme)
+ : nullptr;
pmedata = gmx_pme_init(cr,
getNumPmeDomains(cr->dd),
mdrunOptions.reproducible,
ewaldcoeff_q,
ewaldcoeff_lj,
- gmx_omp_nthreads_get(emntPME),
+ gmx_omp_nthreads_get(ModuleMultiThread::Pme),
pmeRunMode,
nullptr,
deviceContext,
if (thisRankHasDuty(cr, DUTY_PP))
{
/* Assumes uniform use of the number of OpenMP threads */
- walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+ walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(ModuleMultiThread::Default));
if (inputrec->bPull)
{
pull_work = init_pull(fplog,
inputrec->pull.get(),
inputrec.get(),
- &mtop,
+ mtop,
cr,
&atomSets,
inputrec->fepvals->init_lambda);
cr,
&atomSets,
globalState.get(),
- &mtop,
+ mtop,
oenv,
mdrunOptions,
startingBehavior);
}
t_swap* swap = nullptr;
- if (inputrec->eSwapCoords != eswapNO)
+ if (inputrec->eSwapCoords != SwapType::No)
{
/* Initialize ion swapping code */
swap = init_swapcoords(fplog,
inputrec.get(),
opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
- &mtop,
+ mtop,
globalState.get(),
&observablesHistory,
cr,
}
/* Let makeConstraints know whether we have essential dynamics constraints. */
- auto constr = makeConstraints(
- mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr, ms, &nrnb, wcycle, fr->bMolPBC);
+ auto constr = makeConstraints(mtop,
+ *inputrec,
+ pull_work,
+ doEssentialDynamics,
+ fplog,
+ cr,
+ updateGroups.useUpdateGroups(),
+ ms,
+ &nrnb,
+ wcycle.get(),
+ fr->bMolPBC,
+ &observablesReducerBuilder);
/* Energy terms and groups */
gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
// cos acceleration is only supported by md, but older tpr
// files might still combine it with other integrators
- GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == eiMD,
+ GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == IntegrationAlgorithm::MD,
"cos_acceleration is only supported by integrator=md");
/* Kinetic energy data */
- gmx_ekindata_t ekind;
- init_ekindata(fplog, &(inputrec->opts), &ekind, inputrec->cos_accel);
+ gmx_ekindata_t ekind(inputrec->opts.ngtc,
+ inputrec->cos_accel,
+ gmx_omp_nthreads_get(ModuleMultiThread::Update));
/* Set up interactive MD (IMD) */
auto imdSession = makeImdSession(inputrec.get(),
cr,
- wcycle,
+ wcycle.get(),
&enerd,
ms,
- &mtop,
+ mtop,
mdlog,
- MASTER(cr) ? globalState->x.rvec_array() : nullptr,
+ MASTER(cr) ? globalState->x : gmx::ArrayRef<gmx::RVec>(),
filenames.size(),
filenames.data(),
oenv,
mdrunOptions.imdOptions,
startingBehavior);
- if (DOMAINDECOMP(cr))
+ if (haveDDAtomOrdering(*cr))
{
GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
- /* This call is not included in init_domain_decomposition mainly
- * because fr->cginfo_mb is set later.
+ /* This call is not included in init_domain_decomposition
+ * because fr->atomInfoForEachMoleculeBlock is set later.
*/
- dd_init_bondeds(fplog,
- cr->dd,
- mtop,
- vsite.get(),
- inputrec.get(),
- domdecOptions.checkBondedInteractions,
- fr->cginfo_mb);
+ makeBondedLinks(cr->dd, mtop, fr->atomInfoForEachMoleculeBlock);
}
if (runScheduleWork.simulationWork.useGpuBufferOps)
fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal),
- wcycle);
+ wcycle.get());
fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
deviceStreamManager->context(),
deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal),
- wcycle);
+ wcycle.get());
}
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
&& ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
|| runScheduleWork.simulationWork.useGpuBufferOps))
{
- GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
- ? GpuApiCallBehavior::Async
- : GpuApiCallBehavior::Sync;
+ GpuApiCallBehavior transferKind =
+ (inputrec->eI == IntegrationAlgorithm::MD && !doRerun && !useModularSimulator)
+ ? GpuApiCallBehavior::Async
+ : GpuApiCallBehavior::Sync;
GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
"GPU device stream manager should be initialized to use GPU.");
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
+ *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle.get());
fr->stateGpu = stateGpu.get();
}
GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
SimulatorBuilder simulatorBuilder;
- simulatorBuilder.add(SimulatorStateData(globalState.get(), &observablesHistory, &enerd, &ekind));
+ simulatorBuilder.add(SimulatorStateData(
+ globalState.get(), localState, &observablesHistory, &enerd, &ekind));
simulatorBuilder.add(std::move(membedHolder));
simulatorBuilder.add(std::move(stopHandlerBuilder_));
simulatorBuilder.add(SimulatorConfig(mdrunOptions, startingBehavior, &runScheduleWork));
- simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv));
- simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle));
+ simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv, &observablesReducerBuilder));
+ simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle.get()));
simulatorBuilder.add(ConstraintsParam(
constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, vsite.get()));
// TODO: Separate `fr` to a separate add, and make the `build` handle the coupling sensibly.
simulatorBuilder.add(LegacyInput(
- static_cast<int>(filenames.size()), filenames.data(), inputrec.get(), fr));
+ static_cast<int>(filenames.size()), filenames.data(), inputrec.get(), fr.get()));
simulatorBuilder.add(ReplicaExchangeParameters(replExParams));
simulatorBuilder.add(InteractiveMD(imdSession.get()));
- simulatorBuilder.add(SimulatorModules(mdModules_->outputProvider(), mdModules_->notifier()));
+ simulatorBuilder.add(SimulatorModules(mdModules_->outputProvider(), mdModules_->notifiers()));
simulatorBuilder.add(CenterOfMassPulling(pull_work));
// Todo move to an MDModule
simulatorBuilder.add(IonSwapping(swap));
- simulatorBuilder.add(TopologyData(&mtop, mdAtoms.get()));
+ simulatorBuilder.add(TopologyData(mtop, &localTopology, mdAtoms.get()));
simulatorBuilder.add(BoxDeformationHandle(deform.get()));
simulatorBuilder.add(std::move(modularSimulatorCheckpointData));
{
GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
/* do PME only */
- walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
+ walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(ModuleMultiThread::Pme));
gmx_pmeonly(pmedata,
cr,
&nrnb,
- wcycle,
+ wcycle.get(),
walltime_accounting,
inputrec.get(),
pmeRunMode,
+ runScheduleWork.simulationWork.useGpuPmePpCommunication,
deviceStreamManager.get());
}
- wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_stop(wcycle.get(), WallCycleCounter::Run);
/* Finish up, write some stuff
* if rerunMD, don't write last frame again
finish_run(fplog,
mdlog,
cr,
- inputrec.get(),
+ *inputrec,
&nrnb,
- wcycle,
+ wcycle.get(),
walltime_accounting,
fr ? fr->nbv.get() : nullptr,
pmedata,
EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
- // clean up cycle counter
- wallcycle_destroy(wcycle);
deviceStreamManager.reset(nullptr);
// Free PME data
// As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
mdAtoms.reset(nullptr);
globalState.reset(nullptr);
+ localStateInstance.reset(nullptr);
mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
- gpuBonded.reset(nullptr);
- /* Free pinned buffers in *fr */
- delete fr;
- fr = nullptr;
+    fr.reset(nullptr); // destruct forcerec and free its pinned buffers before the GPU context is released
// TODO convert to C++ so we can get rid of these frees
sfree(disresdata);
- sfree(oriresdata);
if (!hwinfo_->deviceInfoList.empty())
{
{
physicalNodeComm.barrier();
}
- releaseDevice(deviceInfo);
+
+ if (!devFlags.usingCudaAwareMpi)
+ {
+            // Don't reset the GPU when using CUDA-aware MPI: UCX creates CUDA
+            // buffers that are cleaned up as part of MPI_Finalize(), and
+            // resetting the device before MPI_Finalize() results in crashes
+            // inside UCX.
+ releaseDevice(deviceInfo);
+ }
/* Does what it says */
print_date_and_time(fplog, cr->nodeid, "Finished mdrun", gmx_gettime());