#include "gromacs/domdec/domdec_struct.h"
#include "gromacs/domdec/gpuhaloexchange.h"
#include "gromacs/domdec/localatomsetmanager.h"
+#include "gromacs/domdec/makebondedlinks.h"
#include "gromacs/domdec/partition.h"
+#include "gromacs/domdec/reversetopology.h"
#include "gromacs/ewald/ewald_utils.h"
#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_gpu_program.h"
+#include "gromacs/ewald/pme_only.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/fileio/checkpoint.h"
#include "gromacs/fileio/gmxfio.h"
#include "gromacs/fileio/tpxio.h"
#include "gromacs/gmxlib/network.h"
#include "gromacs/gmxlib/nrnb.h"
-#include "gromacs/gpu_utils/gpu_utils.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
#include "gromacs/hardware/cpuinfo.h"
#include "gromacs/hardware/detecthardware.h"
+#include "gromacs/hardware/device_management.h"
+#include "gromacs/hardware/hardwaretopology.h"
#include "gromacs/hardware/printhardware.h"
#include "gromacs/imd/imd.h"
#include "gromacs/listed_forces/disre.h"
-#include "gromacs/listed_forces/gpubonded.h"
+#include "gromacs/listed_forces/listed_forces_gpu.h"
+#include "gromacs/listed_forces/listed_forces.h"
#include "gromacs/listed_forces/orires.h"
#include "gromacs/math/functions.h"
#include "gromacs/math/utilities.h"
#include "gromacs/mdlib/force.h"
#include "gromacs/mdlib/forcerec.h"
#include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/gpuforcereduction.h"
#include "gromacs/mdlib/makeconstraints.h"
#include "gromacs/mdlib/md_support.h"
#include "gromacs/mdlib/mdatoms.h"
-#include "gromacs/mdlib/membed.h"
-#include "gromacs/mdlib/qmmm.h"
#include "gromacs/mdlib/sighandler.h"
#include "gromacs/mdlib/stophandler.h"
+#include "gromacs/mdlib/tgroup.h"
#include "gromacs/mdlib/updategroups.h"
+#include "gromacs/mdlib/vsite.h"
#include "gromacs/mdrun/mdmodules.h"
#include "gromacs/mdrun/simulationcontext.h"
+#include "gromacs/mdrun/simulationinput.h"
+#include "gromacs/mdrun/simulationinputhandle.h"
#include "gromacs/mdrunutility/handlerestart.h"
#include "gromacs/mdrunutility/logging.h"
#include "gromacs/mdrunutility/multisim.h"
#include "gromacs/mdrunutility/printtime.h"
#include "gromacs/mdrunutility/threadaffinity.h"
+#include "gromacs/mdtypes/checkpointdata.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/group.h"
#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/mdtypes/mdrunoptions.h"
#include "gromacs/mdtypes/observableshistory.h"
+#include "gromacs/mdtypes/observablesreducer.h"
#include "gromacs/mdtypes/simulation_workload.h"
#include "gromacs/mdtypes/state.h"
#include "gromacs/mdtypes/state_propagator_data_gpu.h"
+#include "gromacs/modularsimulator/modularsimulator.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
#include "gromacs/nbnxm/nbnxm.h"
#include "gromacs/nbnxm/pairlist_tuning.h"
#include "gromacs/utility/keyvaluetree.h"
#include "gromacs/utility/logger.h"
#include "gromacs/utility/loggerbuilder.h"
-#include "gromacs/utility/mdmodulenotification.h"
+#include "gromacs/utility/mdmodulesnotifiers.h"
#include "gromacs/utility/physicalnodecommunicator.h"
#include "gromacs/utility/pleasecite.h"
#include "gromacs/utility/programcontext.h"
#include "gromacs/utility/smalloc.h"
#include "gromacs/utility/stringutil.h"
+#include "gromacs/utility/mpiinfo.h"
#include "isimulator.h"
+#include "membedholder.h"
#include "replicaexchange.h"
#include "simulatorbuilder.h"
{
DevelopmentFeatureFlags devFlags;
- // Some builds of GCC 5 give false positive warnings that these
- // getenv results are ignored when clearly they are used.
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-result"
- devFlags.enableGpuBufferOps = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr)
- && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
+ devFlags.enableGpuBufferOps = (GMX_GPU_CUDA || GMX_GPU_SYCL) && useGpuForNonbonded
+ && (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+ devFlags.enableGpuHaloExchange = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_DD_COMMS") != nullptr;
devFlags.forceGpuUpdateDefault = (getenv("GMX_FORCE_UPDATE_DEFAULT_GPU") != nullptr) || GMX_FAHCORE;
- devFlags.enableGpuHaloExchange =
- (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
- devFlags.enableGpuPmePPComm =
- (getenv("GMX_GPU_PME_PP_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
-#pragma GCC diagnostic pop
+ devFlags.enableGpuPmePPComm = GMX_MPI && GMX_GPU_CUDA && getenv("GMX_GPU_PME_PP_COMMS") != nullptr;
+
+ // The direct GPU communication path is being used with CUDA-aware MPI;
+ // make sure the underlying MPI implementation is CUDA-aware.
+ if (!GMX_THREAD_MPI && (devFlags.enableGpuPmePPComm || devFlags.enableGpuHaloExchange))
+ {
+ const bool haveDetectedCudaAwareMpi =
+ (checkMpiCudaAwareSupport() == CudaAwareMpiStatus::Supported);
+ const bool forceCudaAwareMpi = (getenv("GMX_FORCE_CUDA_AWARE_MPI") != nullptr);
+
+ if (!haveDetectedCudaAwareMpi && forceCudaAwareMpi)
+ {
+ // CUDA-aware support was not detected in the MPI library, but the user has forced its use
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "This run has forced use of 'CUDA-aware MPI'. "
+ "But, GROMACS cannot determine if underlying MPI "
+ "is CUDA-aware. GROMACS recommends use of latest openMPI version "
+ "for CUDA-aware support. "
+ "If you observe failures at runtime, try unsetting "
+ "GMX_FORCE_CUDA_AWARE_MPI environment variable.");
+ }
+
+ if (haveDetectedCudaAwareMpi || forceCudaAwareMpi)
+ {
+ devFlags.usingCudaAwareMpi = true;
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "Using CUDA-aware MPI for 'GPU halo exchange' or 'GPU PME-PP "
+ "communications' feature.");
+ }
+ else
+ {
+ if (devFlags.enableGpuHaloExchange)
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "GMX_GPU_DD_COMMS environment variable detected, but the 'GPU "
+ "halo exchange' feature will not be enabled as GROMACS couldn't "
+ "detect CUDA_aware support in underlying MPI implementation.");
+ devFlags.enableGpuHaloExchange = false;
+ }
+ if (devFlags.enableGpuPmePPComm)
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText(
+ "GMX_GPU_PME_PP_COMMS environment variable detected, but the "
+ "'GPU PME-PP communications' feature will not be enabled as "
+ "GROMACS couldn't "
+ "detect CUDA_aware support in underlying MPI implementation.");
+ devFlags.enableGpuPmePPComm = false;
+ }
+
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "GROMACS recommends use of latest OpenMPI version for CUDA-aware "
+ "support. "
+ "If you are certain about CUDA-aware support in your MPI library, "
+ "you can force it's use by setting environment variable "
+ " GMX_FORCE_CUDA_AWARE_MPI.");
+ }
+ }
if (devFlags.enableGpuBufferOps)
{
GMX_LOG(mdlog.warning)
.asParagraph()
.appendTextFormatted(
- "This run uses the 'GPU halo exchange' feature, enabled by the "
+ "This run has requested the 'GPU halo exchange' feature, enabled by "
+ "the "
"GMX_GPU_DD_COMMS environment variable.");
}
else
{
if (pmeRunMode == PmeRunMode::GPU)
{
+ if (!devFlags.enableGpuBufferOps)
+ {
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendTextFormatted(
+ "Enabling GPU buffer operations required by GMX_GPU_PME_PP_COMMS "
+ "(equivalent with GMX_USE_GPU_BUFFER_OPS=1).");
+ devFlags.enableGpuBufferOps = true;
+ }
GMX_LOG(mdlog.warning)
.asParagraph()
.appendTextFormatted(
// Copy members of master runner.
// \todo Replace with builder when Simulation context and/or runner phases are better defined.
- // Ref https://redmine.gromacs.org/issues/2587 and https://redmine.gromacs.org/issues/2375
+ // Ref https://gitlab.com/gromacs/gromacs/-/issues/2587 and https://gitlab.com/gromacs/gromacs/-/issues/2375
newRunner.hw_opt = hw_opt;
newRunner.filenames = filenames;
+ newRunner.hwinfo_ = hwinfo_;
newRunner.oenv = oenv;
newRunner.mdrunOptions = mdrunOptions;
newRunner.domdecOptions = domdecOptions;
newRunner.pforce = pforce;
// Give the spawned thread the newly created valid communicator
// for the simulation.
- newRunner.communicator = MPI_COMM_WORLD;
- newRunner.ms = ms;
- newRunner.startingBehavior = startingBehavior;
- newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
+ newRunner.libraryWorldCommunicator = MPI_COMM_WORLD;
+ newRunner.simulationCommunicator = MPI_COMM_WORLD;
+ newRunner.ms = ms;
+ newRunner.startingBehavior = startingBehavior;
+ newRunner.stopHandlerBuilder_ = std::make_unique<StopHandlerBuilder>(*stopHandlerBuilder_);
+ newRunner.inputHolder_ = inputHolder_;
threadMpiMdrunnerAccessBarrier();
{
try
{
- auto masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
+ const auto* masterMdrunner = reinterpret_cast<const gmx::Mdrunner*>(arg);
/* copy the arg list to make sure that it's thread-local. This
doesn't copy pointed-to items, of course; fnm, cr and fplog
are reset in the call below, all others should be const. */
#if GMX_THREAD_MPI
/* now spawn new threads that start mdrunner_start_fn(), while
the main thread returns. Thread affinity is handled later. */
- if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, mdrunner_start_fn,
- static_cast<const void*>(this))
+ if (tMPI_Init_fn(TRUE, numThreadsToLaunch, TMPI_AFFINITY_NONE, mdrunner_start_fn, static_cast<const void*>(this))
!= TMPI_SUCCESS)
{
GMX_THROW(gmx::InternalError("Failed to spawn thread-MPI threads"));
// Give the master thread the newly created valid communicator for
// the simulation.
- communicator = MPI_COMM_WORLD;
+ libraryWorldCommunicator = MPI_COMM_WORLD;
+ simulationCommunicator = MPI_COMM_WORLD;
threadMpiMdrunnerAccessBarrier();
#else
GMX_UNUSED_VALUE(numThreadsToLaunch);
t_commrec* cr,
t_inputrec* ir,
int nstlist_cmdline,
- const gmx_mtop_t* mtop,
+ const gmx_mtop_t& mtop,
const matrix box,
bool makeGpuPairList,
const gmx::CpuInfo& cpuinfo)
{
+ // We checked the cut-offs in grompp, but double-check here.
+ // We have PME+LJcutoff kernels for rcoulomb>rvdw.
+ if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == VanDerWaalsType::Cut)
+ {
+ GMX_RELEASE_ASSERT(ir->rcoulomb >= ir->rvdw,
+ "With Verlet lists and PME we should have rcoulomb>=rvdw");
+ }
+ else
+ {
+ GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw,
+ "With Verlet lists and no PME rcoulomb and rvdw should be identical");
+ }
/* For NVE simulations, we will retain the initial list buffer */
- if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
+ if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0
+ && !(EI_MD(ir->eI) && ir->etc == TemperatureCoupling::No))
{
/* Update the Verlet buffer size for the current run setup */
VerletbufListSetup listSetup = verletbufGetSafeListSetup(listType);
const real rlist_new =
- calcVerletBufferSize(*mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
+ calcVerletBufferSize(mtop, det(box), *ir, ir->nstlist, ir->nstlist - 1, -1, listSetup);
if (rlist_new != ir->rlist)
{
{
fprintf(fplog,
"\nChanging rlist from %g to %g for non-bonded %dx%d atom kernels\n\n",
- ir->rlist, rlist_new, listSetup.cluster_size_i, listSetup.cluster_size_j);
+ ir->rlist,
+ rlist_new,
+ listSetup.cluster_size_i,
+ listSetup.cluster_size_j);
}
ir->rlist = rlist_new;
}
if (nstlist_cmdline > 0 && (!EI_DYNAMICS(ir->eI) || ir->verletbuf_tol <= 0))
{
- gmx_fatal(FARGS, "Can not set nstlist without %s",
+ gmx_fatal(FARGS,
+ "Can not set nstlist without %s",
!EI_DYNAMICS(ir->eI) ? "dynamics" : "verlet-buffer-tolerance");
}
if (EI_DYNAMICS(ir->eI))
{
/* Set or try nstlist values */
- increaseNstlist(fplog, cr, ir, nstlist_cmdline, mtop, box, makeGpuPairList, cpuinfo);
+ increaseNstlist(fplog, cr, ir, nstlist_cmdline, &mtop, box, makeGpuPairList, cpuinfo);
}
}
{
sprintf(sbuf_msg,
"Overriding nsteps with value passed on the command line: %s steps, %.3g ps",
- gmx_step_str(nsteps_cmdline, sbuf_steps), fabs(nsteps_cmdline * ir->delta_t));
+ gmx_step_str(nsteps_cmdline, sbuf_steps),
+ fabs(nsteps_cmdline * ir->delta_t));
}
else
{
- sprintf(sbuf_msg, "Overriding nsteps with value passed on the command line: %s steps",
+ sprintf(sbuf_msg,
+ "Overriding nsteps with value passed on the command line: %s steps",
gmx_step_str(nsteps_cmdline, sbuf_steps));
}
static void finish_run(FILE* fplog,
const gmx::MDLogger& mdlog,
const t_commrec* cr,
- const t_inputrec* inputrec,
+ const t_inputrec& inputrec,
t_nrnb nrnb[],
- gmx_wallcycle_t wcycle,
+ gmx_wallcycle* wcycle,
gmx_walltime_accounting_t walltime_accounting,
nonbonded_verlet_t* nbv,
const gmx_pme_t* pme,
Further, we only report performance for dynamical integrators,
because those are the only ones for which we plan to
consider doing any optimizations. */
- bool printReport = EI_DYNAMICS(inputrec->eI) && SIMMASTER(cr);
+ bool printReport = EI_DYNAMICS(inputrec.eI) && SIMMASTER(cr);
if (printReport && !walltime_accounting_get_valid_finish(walltime_accounting))
{
nrnbTotalStorage = std::make_unique<t_nrnb>();
nrnb_tot = nrnbTotalStorage.get();
#if GMX_MPI
- MPI_Allreduce(nrnb->n, nrnb_tot->n, eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+ MPI_Allreduce(nrnb->n.data(), nrnb_tot->n.data(), eNRNB, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
#endif
}
else
{
#if GMX_MPI
/* reduce elapsed_time over all MPI ranks in the current simulation */
- MPI_Allreduce(&elapsed_time, &elapsed_time_over_all_ranks, 1, MPI_DOUBLE, MPI_SUM,
- cr->mpi_comm_mysim);
+ MPI_Allreduce(&elapsed_time, &elapsed_time_over_all_ranks, 1, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
elapsed_time_over_all_ranks /= cr->nnodes;
/* Reduce elapsed_time_over_all_threads over all MPI ranks in the
* current simulation. */
- MPI_Allreduce(&elapsed_time_over_all_threads, &elapsed_time_over_all_threads_over_all_ranks,
- 1, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+ MPI_Allreduce(&elapsed_time_over_all_threads,
+ &elapsed_time_over_all_threads_over_all_ranks,
+ 1,
+ MPI_DOUBLE,
+ MPI_SUM,
+ cr->mpi_comm_mysim);
#endif
}
else
print_flop(fplog, nrnb_tot, &nbfs, &mflop);
}
- if (thisRankHasDuty(cr, DUTY_PP) && DOMAINDECOMP(cr))
+ if (thisRankHasDuty(cr, DUTY_PP) && haveDDAtomOrdering(*cr))
{
print_dd_statistics(cr, inputrec, fplog);
}
* to the code that handled the thread region, so that there's a
* mechanism to keep cycle counting working during the transition
* to task parallelism. */
- int nthreads_pp = gmx_omp_nthreads_get(emntNonbonded);
- int nthreads_pme = gmx_omp_nthreads_get(emntPME);
- wallcycle_scale_by_num_threads(wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP),
- nthreads_pp, nthreads_pme);
+ int nthreads_pp = gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded);
+ int nthreads_pme = gmx_omp_nthreads_get(ModuleMultiThread::Pme);
+ wallcycle_scale_by_num_threads(
+ wcycle, thisRankHasDuty(cr, DUTY_PME) && !thisRankHasDuty(cr, DUTY_PP), nthreads_pp, nthreads_pme);
auto cycle_sum(wallcycle_sum(cr, wcycle));
if (printReport)
{
- auto nbnxn_gpu_timings =
+ auto* nbnxn_gpu_timings =
(nbv != nullptr && nbv->useGpu()) ? Nbnxm::gpu_get_timings(nbv->gpu_nbv) : nullptr;
gmx_wallclock_gpu_pme_t pme_gpu_timings = {};
{
pme_gpu_get_timings(pme, &pme_gpu_timings);
}
- wallcycle_print(fplog, mdlog, cr->nnodes, cr->npmenodes, nthreads_pp, nthreads_pme,
- elapsed_time_over_all_ranks, wcycle, cycle_sum, nbnxn_gpu_timings,
+ wallcycle_print(fplog,
+ mdlog,
+ cr->nnodes,
+ cr->npmenodes,
+ nthreads_pp,
+ nthreads_pme,
+ elapsed_time_over_all_ranks,
+ wcycle,
+ cycle_sum,
+ nbnxn_gpu_timings,
&pme_gpu_timings);
- if (EI_DYNAMICS(inputrec->eI))
+ if (EI_DYNAMICS(inputrec.eI))
{
- delta_t = inputrec->delta_t;
+ delta_t = inputrec.delta_t;
}
if (fplog)
{
- print_perf(fplog, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
+ print_perf(fplog,
+ elapsed_time_over_all_threads_over_all_ranks,
+ elapsed_time_over_all_ranks,
walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
- delta_t, nbfs, mflop);
+ delta_t,
+ nbfs,
+ mflop);
}
if (bWriteStat)
{
- print_perf(stderr, elapsed_time_over_all_threads_over_all_ranks, elapsed_time_over_all_ranks,
+ print_perf(stderr,
+ elapsed_time_over_all_threads_over_all_ranks,
+ elapsed_time_over_all_ranks,
walltime_accounting_get_nsteps_done_since_reset(walltime_accounting),
- delta_t, nbfs, mflop);
+ delta_t,
+ nbfs,
+ mflop);
}
}
}
int Mdrunner::mdrunner()
{
- matrix box;
- t_forcerec* fr = nullptr;
- t_fcdata* fcd = nullptr;
- real ewaldcoeff_q = 0;
- real ewaldcoeff_lj = 0;
- int nChargePerturbed = -1, nTypePerturbed = 0;
- gmx_wallcycle_t wcycle;
- gmx_walltime_accounting_t walltime_accounting = nullptr;
- gmx_membed_t* membed = nullptr;
- gmx_hw_info_t* hwinfo = nullptr;
+ matrix box;
+ std::unique_ptr<t_forcerec> fr;
+ real ewaldcoeff_q = 0;
+ real ewaldcoeff_lj = 0;
+ int nChargePerturbed = -1, nTypePerturbed = 0;
+ gmx_walltime_accounting_t walltime_accounting = nullptr;
+ MembedHolder membedHolder(filenames.size(), filenames.data());
/* CAUTION: threads may be started later on in this function, so
cr doesn't reflect the final parallel state right now */
/* TODO: inputrec should tell us whether we use an algorithm, not a file option */
const bool doEssentialDynamics = opt2bSet("-ei", filenames.size(), filenames.data());
- const bool doMembed = opt2bSet("-membed", filenames.size(), filenames.data());
const bool doRerun = mdrunOptions.rerun;
// Handle task-assignment related user options.
{
fplog = gmx_fio_getfp(logFileHandle);
}
- const bool isSimulationMasterRank = findIsSimulationMasterRank(ms, communicator);
+ const bool isSimulationMasterRank = findIsSimulationMasterRank(ms, simulationCommunicator);
gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
gmx::MDLogger mdlog(logOwner.logger());
- // TODO The thread-MPI master rank makes a working
- // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
- // after the threads have been launched. This works because no use
- // is made of that communicator until after the execution paths
- // have rejoined. But it is likely that we can improve the way
- // this is expressed, e.g. by expressly running detection only the
- // master rank for thread-MPI, rather than relying on the mutex
- // and reference count.
- PhysicalNodeCommunicator physicalNodeComm(communicator, gmx_physicalnode_id_hash());
- hwinfo = gmx_detect_hardware(mdlog, physicalNodeComm);
-
- gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo);
+ gmx_print_detected_hardware(fplog, isSimulationMasterRank && isMasterSim(ms), mdlog, hwinfo_);
- std::vector<int> gpuIdsToUse = makeGpuIdsToUse(hwinfo->gpu_info, hw_opt.gpuIdsAvailable);
+ std::vector<int> availableDevices =
+ makeListOfAvailableDevices(hwinfo_->deviceInfoList, hw_opt.devicesSelectedByUser);
+ const int numAvailableDevices = gmx::ssize(availableDevices);
// Print citation requests after all software/hardware printing
pleaseCiteGromacs(fplog);
- // TODO Replace this by unique_ptr once t_inputrec is C++
- t_inputrec inputrecInstance;
- t_inputrec* inputrec = nullptr;
- std::unique_ptr<t_state> globalState;
+ // Note: legacy program logic relies on checking whether these pointers are assigned.
+ // Objects may or may not be allocated later.
+ std::unique_ptr<t_inputrec> inputrec;
+ std::unique_ptr<t_state> globalState;
auto partialDeserializedTpr = std::make_unique<PartialDeserializedTprFile>();
if (isSimulationMasterRank)
{
+ // Allocate objects to be initialized by later function calls.
/* Only the master rank has the global state */
globalState = std::make_unique<t_state>();
+ inputrec = std::make_unique<t_inputrec>();
/* Read (nearly) all data required for the simulation
* and keep the partly serialized tpr contents to send to other ranks later
*/
- *partialDeserializedTpr = read_tpx_state(ftp2fn(efTPR, filenames.size(), filenames.data()),
- &inputrecInstance, globalState.get(), &mtop);
- inputrec = &inputrecInstance;
+ applyGlobalSimulationState(
+ *inputHolder_.get(), partialDeserializedTpr.get(), globalState.get(), inputrec.get(), &mtop);
}
/* Check and update the hardware options for internal consistency */
- checkAndUpdateHardwareOptions(mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks,
- inputrec);
+ checkAndUpdateHardwareOptions(
+ mdlog, &hw_opt, isSimulationMasterRank, domdecOptions.numPmeRanks, inputrec.get());
if (GMX_THREAD_MPI && isSimulationMasterRank)
{
// the number of GPUs to choose the number of ranks.
auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
useGpuForNonbonded = decideWhetherToUseGpusForNonbondedWithThreadMpi(
- nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment, emulateGpuNonbonded,
+ nonbondedTarget,
+ numAvailableDevices > 0,
+ userGpuTaskAssignment,
+ emulateGpuNonbonded,
canUseGpuForNonbonded,
gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, GMX_THREAD_MPI),
hw_opt.nthreads_tmpi);
- useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(
- useGpuForNonbonded, pmeTarget, gpuIdsToUse, userGpuTaskAssignment, *hwinfo,
- *inputrec, mtop, hw_opt.nthreads_tmpi, domdecOptions.numPmeRanks);
+ useGpuForPme = decideWhetherToUseGpusForPmeWithThreadMpi(useGpuForNonbonded,
+ pmeTarget,
+ pmeFftTarget,
+ numAvailableDevices,
+ userGpuTaskAssignment,
+ *hwinfo_,
+ *inputrec,
+ hw_opt.nthreads_tmpi,
+ domdecOptions.numPmeRanks);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
* TODO Over-writing the user-supplied value here does
* prevent any possible subsequent checks from working
* correctly. */
- hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo, &hw_opt, gpuIdsToUse, useGpuForNonbonded,
- useGpuForPme, inputrec, &mtop, mdlog, doMembed);
+ hw_opt.nthreads_tmpi = get_nthreads_mpi(hwinfo_,
+ &hw_opt,
+ numAvailableDevices,
+ useGpuForNonbonded,
+ useGpuForPme,
+ inputrec.get(),
+ mtop,
+ mdlog,
+ membedHolder.doMembed());
// Now start the threads for thread MPI.
spawnThreads(hw_opt.nthreads_tmpi);
// The spawned threads enter mdrunner() and execution of
// master and spawned threads joins at the end of this block.
- physicalNodeComm = PhysicalNodeCommunicator(communicator, gmx_physicalnode_id_hash());
}
- GMX_RELEASE_ASSERT(communicator == MPI_COMM_WORLD, "Must have valid world communicator");
- CommrecHandle crHandle = init_commrec(communicator, ms);
+ GMX_RELEASE_ASSERT(!GMX_MPI || ms || simulationCommunicator != MPI_COMM_NULL,
+ "Must have valid communicator unless running a multi-simulation");
+ CommrecHandle crHandle = init_commrec(simulationCommunicator);
t_commrec* cr = crHandle.get();
GMX_RELEASE_ASSERT(cr != nullptr, "Must have valid commrec");
+ PhysicalNodeCommunicator physicalNodeComm(libraryWorldCommunicator, gmx_physicalnode_id_hash());
+
+ // If we detected the topology on this system, double-check that it makes sense
+ if (hwinfo_->hardwareTopology->isThisSystem())
+ {
+ hardwareTopologyDoubleCheckDetection(mdlog, *hwinfo_->hardwareTopology);
+ }
+
if (PAR(cr))
{
/* now broadcast everything to the non-master nodes/threads: */
if (!isSimulationMasterRank)
{
- inputrec = &inputrecInstance;
+ // Until now, only the master rank has a non-null pointer.
+ // On non-master ranks, allocate the object that will receive data in the following call.
+ inputrec = std::make_unique<t_inputrec>();
}
- init_parallel(cr, inputrec, &mtop, partialDeserializedTpr.get());
+ init_parallel(cr->mpiDefaultCommunicator,
+ MASTER(cr),
+ inputrec.get(),
+ &mtop,
+ partialDeserializedTpr.get());
}
GMX_RELEASE_ASSERT(inputrec != nullptr, "All ranks should have a valid inputrec now");
partialDeserializedTpr.reset(nullptr);
// the inputrec read by the master rank. The ranks can now all run
// the task-deciding functions and will agree on the result
// without needing to communicate.
- //
- // TODO Should we do the communication in debug mode to support
- // having an assertion?
- const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
+ // The LBFGS minimizer, test-particle insertion, normal modes and shell dynamics don't support DD
+ const bool useDomainDecomposition =
+ !(inputrec->eI == IntegrationAlgorithm::LBFGS || EI_TPI(inputrec->eI)
+ || inputrec->eI == IntegrationAlgorithm::NM
+ || gmx_mtop_particletype_count(mtop)[ParticleType::Shell] > 0);
// Note that these variables describe only their own node.
//
bool useGpuForPme = false;
bool useGpuForBonded = false;
bool useGpuForUpdate = false;
- bool gpusWereDetected = hwinfo->ngpu_compatible_tot > 0;
+ bool gpusWereDetected = hwinfo_->ngpu_compatible_tot > 0;
try
{
// It's possible that there are different numbers of GPUs on
// assignment.
auto canUseGpuForNonbonded = buildSupportsNonbondedOnGpu(nullptr);
useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(
- nonbondedTarget, userGpuTaskAssignment, emulateGpuNonbonded, canUseGpuForNonbonded,
- gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI), gpusWereDetected);
- useGpuForPme = decideWhetherToUseGpusForPme(
- useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, *hwinfo, *inputrec, mtop,
- cr->nnodes, domdecOptions.numPmeRanks, gpusWereDetected);
- auto canUseGpuForBonded = buildSupportsGpuBondeds(nullptr)
- && inputSupportsGpuBondeds(*inputrec, mtop, nullptr);
+ nonbondedTarget,
+ userGpuTaskAssignment,
+ emulateGpuNonbonded,
+ canUseGpuForNonbonded,
+ gpuAccelerationOfNonbondedIsUseful(mdlog, *inputrec, !GMX_THREAD_MPI),
+ gpusWereDetected);
+ useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded,
+ pmeTarget,
+ pmeFftTarget,
+ userGpuTaskAssignment,
+ *hwinfo_,
+ *inputrec,
+ cr->sizeOfDefaultCommunicator,
+ domdecOptions.numPmeRanks,
+ gpusWereDetected);
useGpuForBonded = decideWhetherToUseGpusForBonded(
- useGpuForNonbonded, useGpuForPme, bondedTarget, canUseGpuForBonded,
- EVDW_PME(inputrec->vdwtype), EEL_PME_EWALD(inputrec->coulombtype),
- domdecOptions.numPmeRanks, gpusWereDetected);
+ useGpuForNonbonded, useGpuForPme, bondedTarget, *inputrec, mtop, domdecOptions.numPmeRanks, gpusWereDetected);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
const DevelopmentFeatureFlags devFlags =
manageDevelopmentFeatures(mdlog, useGpuForNonbonded, pmeRunMode);
- const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
- false, inputrec, doRerun, mtop, ms, replExParams, nullptr, doEssentialDynamics, doMembed);
- const bool useModularSimulator = inputIsCompatibleWithModularSimulator
- && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
+ const bool useModularSimulator = checkUseModularSimulator(false,
+ inputrec.get(),
+ doRerun,
+ mtop,
+ ms,
+ replExParams,
+ nullptr,
+ doEssentialDynamics,
+ membedHolder.doMembed());
+
+ ObservablesReducerBuilder observablesReducerBuilder;
// Build restraints.
// TODO: hide restraint implementation details from Mdrunner.
// TODO: Error handling
mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
- const auto& mdModulesNotifier = mdModules_->notifier().notifier_;
+ // now that the MDModules know their options, they know which callbacks to sign up to
+ mdModules_->subscribeToSimulationSetupNotifications();
+ const auto& setupNotifier = mdModules_->notifiers().simulationSetupNotifier_;
+
+ // Notify MdModules of existing logger
+ setupNotifier.notify(mdlog);
+ // Notify MdModules of internal parameters, saved into KVT
if (inputrec->internalParameters != nullptr)
{
- mdModulesNotifier.notify(*inputrec->internalParameters);
+ setupNotifier.notify(*inputrec->internalParameters);
+ }
+
+ // Let MdModules know the .tpr filename
+ {
+ gmx::MdRunInputFilename mdRunInputFilename = { ftp2fn(efTPR, filenames.size(), filenames.data()) };
+ setupNotifier.notify(mdRunInputFilename);
}
if (fplog != nullptr)
{
- pr_inputrec(fplog, 0, "Input Parameters", inputrec, FALSE);
+ pr_inputrec(fplog, 0, "Input Parameters", inputrec.get(), FALSE);
fprintf(fplog, "\n");
}
if (SIMMASTER(cr))
{
/* In rerun, set velocities to zero if present */
- if (doRerun && ((globalState->flags & (1 << estV)) != 0))
+ if (doRerun && ((globalState->flags & enumValueToBitMask(StateEntry::V)) != 0))
{
// rerun does not use velocities
GMX_LOG(mdlog.info)
{
clear_rvec(globalState->v[i]);
}
- globalState->flags &= ~(1 << estV);
+ globalState->flags &= ~enumValueToBitMask(StateEntry::V);
}
/* now make sure the state is initialized and propagated */
- set_state_entries(globalState.get(), inputrec, useModularSimulator);
+ set_state_entries(globalState.get(), inputrec.get(), useModularSimulator);
}
/* NM and TPI parallelize over force/energy calculations, not atoms,
* so we need to initialize and broadcast the global state.
*/
- if (inputrec->eI == eiNM || inputrec->eI == eiTPI)
+ if (inputrec->eI == IntegrationAlgorithm::NM || inputrec->eI == IntegrationAlgorithm::TPI)
{
if (!MASTER(cr))
{
globalState = std::make_unique<t_state>();
}
- broadcastStateWithoutDynamics(cr, globalState.get());
+ broadcastStateWithoutDynamics(
+ cr->mpiDefaultCommunicator, haveDDAtomOrdering(*cr), PAR(cr), globalState.get());
}
/* A parallel command line option consistency check that we can
#endif
}
- if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || eiNM == inputrec->eI))
+ if (doRerun && (EI_ENERGY_MINIMIZATION(inputrec->eI) || IntegrationAlgorithm::NM == inputrec->eI))
{
gmx_fatal(FARGS,
"The .mdp file specified an energy mininization or normal mode algorithm, and "
"these are not compatible with mdrun -rerun");
}
- if (!(EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype)))
- {
- if (domdecOptions.numPmeRanks > 0)
- {
- gmx_fatal_collective(FARGS, cr->mpi_comm_mysim, MASTER(cr),
- "PME-only ranks are requested, but the system does not use PME "
- "for electrostatics or LJ");
- }
-
- domdecOptions.numPmeRanks = 0;
- }
-
- if (useGpuForNonbonded && domdecOptions.numPmeRanks < 0)
- {
- /* With NB GPUs we don't automatically use PME-only CPU ranks. PME ranks can
- * improve performance with many threads per GPU, since our OpenMP
- * scaling is bad, but it's difficult to automate the setup.
- */
- domdecOptions.numPmeRanks = 0;
- }
- if (useGpuForPme)
- {
- if (domdecOptions.numPmeRanks < 0)
- {
- domdecOptions.numPmeRanks = 0;
- // TODO possibly print a note that one can opt-in for a separate PME GPU rank?
- }
- else
- {
- GMX_RELEASE_ASSERT(domdecOptions.numPmeRanks <= 1,
- "PME GPU decomposition is not supported");
- }
- }
-
/* NMR restraints must be initialized before load_checkpoint,
* since with time averaging the history is added to t_state.
* For proper consistency check we therefore need to extend
* So the PME-only nodes (if present) will also initialize
* the distance restraints.
*/
- snew(fcd, 1);
/* This needs to be called before read_checkpoint to extend the state */
- init_disres(fplog, &mtop, inputrec, cr, ms, fcd, globalState.get(), replExParams.exchangeInterval > 0);
-
- init_orires(fplog, &mtop, inputrec, cr, ms, globalState.get(), &(fcd->orires));
-
- auto deform = prepareBoxDeformation(globalState->box, cr, *inputrec);
+ t_disresdata* disresdata;
+ snew(disresdata, 1);
+ init_disres(fplog,
+ mtop,
+ inputrec.get(),
+ DisResRunMode::MDRun,
+ MASTER(cr) ? DDRole::Master : DDRole::Agent,
+ PAR(cr) ? NumRanks::Multiple : NumRanks::Single,
+ cr->mpi_comm_mysim,
+ ms,
+ disresdata,
+ globalState.get(),
+ replExParams.exchangeInterval > 0);
+
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0 && isSimulationMasterRank)
+ {
+ extendStateWithOriresHistory(mtop, *inputrec, globalState.get());
+ }
+
+ auto deform = prepareBoxDeformation(globalState != nullptr ? globalState->box : box,
+ MASTER(cr) ? DDRole::Master : DDRole::Agent,
+ PAR(cr) ? NumRanks::Multiple : NumRanks::Single,
+ cr->mpi_comm_mygroup,
+ *inputrec);
#if GMX_FAHCORE
/* We have to remember the generation's first step before reading checkpoint.
ObservablesHistory observablesHistory = {};
+ auto modularSimulatorCheckpointData = std::make_unique<ReadCheckpointDataHolder>();
if (startingBehavior != StartingBehavior::NewSimulation)
{
/* Check if checkpoint file exists before doing continuation.
inputrec->nsteps = -1;
}
- load_checkpoint(opt2fn_master("-cpi", filenames.size(), filenames.data(), cr),
- logFileHandle, cr, domdecOptions.numCells, inputrec, globalState.get(),
- &observablesHistory, mdrunOptions.reproducible, mdModules_->notifier());
+ // Finish applying initial simulation state information from external sources on all ranks.
+ // Reconcile checkpoint file data with Mdrunner state established up to this point.
+ applyLocalState(*inputHolder_.get(),
+ logFileHandle,
+ cr,
+ domdecOptions.numCells,
+ inputrec.get(),
+ globalState.get(),
+ &observablesHistory,
+ mdrunOptions.reproducible,
+ mdModules_->notifiers(),
+ modularSimulatorCheckpointData.get(),
+ useModularSimulator);
+ // TODO: (#3652) Synchronize filesystem state, SimulationInput contents, and program
+ // invariants
+ // on all code paths.
+ // Write checkpoint or provide hook to update SimulationInput.
+ // If there was a checkpoint file, SimulationInput contains more information
+ // than if there wasn't. At this point, we have synchronized the in-memory
+ // state with the filesystem state only for restarted simulations. We should
+ // be calling applyLocalState unconditionally and expect that the completeness
+ // of SimulationInput is not dependent on its creation method.
if (startingBehavior == StartingBehavior::RestartWithAppending && logFileHandle)
{
"file field.");
}
/* override nsteps with value set on the commandline */
- override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec);
+ override_nsteps_cmdline(mdlog, mdrunOptions.numStepsCommandline, inputrec.get());
- if (SIMMASTER(cr))
+ if (isSimulationMasterRank)
{
copy_mat(globalState->box, box);
}
if (PAR(cr))
{
- gmx_bcast(sizeof(box), box, cr);
+ gmx_bcast(sizeof(box), box, cr->mpiDefaultCommunicator);
}
- if (inputrec->cutoff_scheme != ecutsVERLET)
+ if (inputrec->cutoff_scheme != CutoffScheme::Verlet)
{
gmx_fatal(FARGS,
"This group-scheme .tpr file can no longer be run by mdrun. Please update to the "
"Verlet scheme, or use an earlier version of GROMACS if necessary.");
}
/* Update rlist and nstlist. */
- prepare_verlet_scheme(fplog, cr, inputrec, nstlist_cmdline, &mtop, box,
+ /* Note: prepare_verlet_scheme is calling increaseNstlist(...), which (while attempting to
+ * increase rlist) tries to check if the newly chosen value fits with the DD scheme. As this is
+ * run before any DD scheme is set up, this check is never executed. See #3334 for more details.
+ */
+ prepare_verlet_scheme(fplog,
+ cr,
+ inputrec.get(),
+ nstlist_cmdline,
+ mtop,
+ box,
useGpuForNonbonded || (emulateGpuNonbonded == EmulateGpuNonbonded::Yes),
- *hwinfo->cpuInfo);
+ *hwinfo_->cpuInfo);
+
+ // We need to decide on update groups early, as this affects
+ // inter-domain communication distances.
+ auto updateGroupingsPerMoleculeType = makeUpdateGroupingsPerMoleculeType(mtop);
+ const real maxUpdateGroupRadius = computeMaxUpdateGroupRadius(
+ mtop, updateGroupingsPerMoleculeType, maxReferenceTemperature(*inputrec));
+ const real cutoffMargin = std::sqrt(max_cutoff2(inputrec->pbcType, box)) - inputrec->rlist;
+ UpdateGroups updateGroups = makeUpdateGroups(mdlog,
+ std::move(updateGroupingsPerMoleculeType),
+ maxUpdateGroupRadius,
+ useDomainDecomposition,
+ systemHasConstraintsOrVsites(mtop),
+ cutoffMargin);
+
+ try
+ {
+ const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
+
+ useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
+ updateGroups.useUpdateGroups(),
+ pmeRunMode,
+ domdecOptions.numPmeRanks > 0,
+ useGpuForNonbonded,
+ updateTarget,
+ gpusWereDetected,
+ *inputrec,
+ mtop,
+ doEssentialDynamics,
+ gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+ haveFrozenAtoms,
+ doRerun,
+ devFlags,
+ mdlog);
+ }
+ GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
+
+ bool useGpuDirectHalo = false;
+
+ if (useGpuForNonbonded)
+ {
+ // cr->npmenodes is not yet initialized.
+ // domdecOptions.numPmeRanks == -1 results in 0 separate PME ranks when useGpuForNonbonded is true.
+ // Todo: remove this assumption later once auto mode has support for separate PME rank
+ const int numPmeRanks = domdecOptions.numPmeRanks > 0 ? domdecOptions.numPmeRanks : 0;
+ bool havePPDomainDecomposition = (cr->sizeOfDefaultCommunicator - numPmeRanks) > 1;
+ useGpuDirectHalo = decideWhetherToUseGpuForHalo(devFlags,
+ havePPDomainDecomposition,
+ useGpuForNonbonded,
+ useModularSimulator,
+ doRerun,
+ EI_ENERGY_MINIMIZATION(inputrec->eI));
+ }
- const bool prefer1DAnd1PulseDD = (devFlags.enableGpuHaloExchange && useGpuForNonbonded);
// This builder is necessary while we have multi-part construction
// of DD. Before DD is constructed, we use the existence of
// the builder object to indicate that further construction of DD
std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
if (useDomainDecomposition)
{
- ddBuilder = std::make_unique<DomainDecompositionBuilder>(
- mdlog, cr, domdecOptions, mdrunOptions, prefer1DAnd1PulseDD, mtop, *inputrec, box,
- positionsFromStatePointer(globalState.get()));
+ // P2P GPU comm + GPU update leads to case in which we enqueue async work for multiple
+ // timesteps. DLB needs to be disabled in that case
+ const bool directGpuCommUsedWithGpuUpdate = GMX_THREAD_MPI && useGpuDirectHalo && useGpuForUpdate;
+ ddBuilder = std::make_unique<DomainDecompositionBuilder>(
+ mdlog,
+ cr,
+ domdecOptions,
+ mdrunOptions,
+ mtop,
+ *inputrec,
+ mdModules_->notifiers(),
+ box,
+ updateGroups.updateGroupingPerMoleculeType(),
+ updateGroups.useUpdateGroups(),
+ updateGroups.maxUpdateGroupRadius(),
+ positionsFromStatePointer(globalState.get()),
+ useGpuForNonbonded,
+ useGpuForPme,
+ directGpuCommUsedWithGpuUpdate);
}
else
{
/* PME, if used, is done on all nodes with 1D decomposition */
- cr->npmenodes = 0;
- cr->duty = (DUTY_PP | DUTY_PME);
+ cr->nnodes = cr->sizeOfDefaultCommunicator;
+ cr->sim_nodeid = cr->rankInDefaultCommunicator;
+ cr->nodeid = cr->rankInDefaultCommunicator;
+ cr->npmenodes = 0;
+ cr->duty = (DUTY_PP | DUTY_PME);
- if (inputrec->ePBC == epbcSCREW)
+ if (inputrec->pbcType == PbcType::Screw)
{
gmx_fatal(FARGS, "pbc=screw is only implemented with domain decomposition");
}
}
- // Produce the task assignment for this rank.
- GpuTaskAssignmentsBuilder gpuTaskAssignmentsBuilder;
- GpuTaskAssignments gpuTaskAssignments = gpuTaskAssignmentsBuilder.build(
- gpuIdsToUse, userGpuTaskAssignment, *hwinfo, communicator, physicalNodeComm,
- nonbondedTarget, pmeTarget, bondedTarget, updateTarget, useGpuForNonbonded,
- useGpuForPme, thisRankHasDuty(cr, DUTY_PP),
+ // Produce the task assignment for this rank - done after DD is constructed
+ GpuTaskAssignments gpuTaskAssignments = GpuTaskAssignmentsBuilder::build(
+ availableDevices,
+ userGpuTaskAssignment,
+ *hwinfo_,
+ simulationCommunicator,
+ physicalNodeComm,
+ nonbondedTarget,
+ pmeTarget,
+ bondedTarget,
+ updateTarget,
+ useGpuForNonbonded,
+ useGpuForPme,
+ thisRankHasDuty(cr, DUTY_PP),
// TODO cr->duty & DUTY_PME should imply that a PME
// algorithm is active, but currently does not.
EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
// Get the device handles for the modules, nullptr when no task is assigned.
- gmx_device_info_t* nonbondedDeviceInfo = gpuTaskAssignments.initNonbondedDevice(cr);
- gmx_device_info_t* pmeDeviceInfo = gpuTaskAssignments.initPmeDevice();
+ int deviceId = -1;
+ DeviceInformation* deviceInfo = gpuTaskAssignments.initDevice(&deviceId);
- // TODO Initialize GPU streams here.
+ // timing enabling - TODO put this in gpu_utils (even though generally this is just option handling?)
+ bool useTiming = true;
+
+ if (GMX_GPU_CUDA)
+ {
+ /* WARNING: CUDA timings are incorrect with multiple streams.
+ * This is the main reason why they are disabled by default.
+ */
+ // TODO: Consider turning on by default when we can detect nr of streams.
+ useTiming = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
+ }
+ else if (GMX_GPU_OPENCL)
+ {
+ useTiming = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
+ }
// TODO Currently this is always built, yet DD partition code
// checks if it is built before using it. Probably it should
// requires it (e.g. pull, CompEl, density fitting), so that we
// don't update the local atom sets unilaterally every step.
LocalAtomSetManager atomSets;
+
+ // Local state and topology are declared (and perhaps constructed)
+ // now, because DD needs them for the LocalTopologyChecker, but
+ // they do not contain valid data until after the first DD
+ // partition.
+ std::unique_ptr<t_state> localStateInstance;
+ t_state* localState;
+ gmx_localtop_t localTopology(mtop.ffparams);
+
if (ddBuilder)
{
+ localStateInstance = std::make_unique<t_state>();
+ localState = localStateInstance.get();
// TODO Pass the GPU streams to ddBuilder to use in buffer
// transfers (e.g. halo exchange)
- cr->dd = ddBuilder->build(&atomSets);
+ cr->dd = ddBuilder->build(&atomSets, localTopology, *localState, &observablesReducerBuilder);
// The builder's job is done, so destruct it
ddBuilder.reset(nullptr);
// Note that local state still does not exist yet.
}
+ else
+ {
+ // Without DD, the local state is merely an alias to the global state,
+ // so we don't need to allocate anything.
+ localState = globalState.get();
+ }
- // The GPU update is decided here because we need to know whether the constraints or
- // SETTLEs can span accross the domain borders (i.e. whether or not update groups are
- // defined). This is only known after DD is initialized, hence decision on using GPU
- // update is done so late.
- try
+ // Ensure that all atoms within the same update group are in the
+ // same periodic image. Otherwise, a simulation that did not use
+ // update groups (e.g. a single-rank simulation) cannot always be
+ // correctly restarted in a way that does use update groups
+ // (e.g. a multi-rank simulation).
+ if (isSimulationMasterRank)
{
const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
-
- useGpuForUpdate = decideWhetherToUseGpuForUpdate(
- useDomainDecomposition, useUpdateGroups, pmeRunMode, domdecOptions.numPmeRanks > 0,
- useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop,
- doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
- replExParams.exchangeInterval > 0, doRerun, devFlags, mdlog);
+ if (useUpdateGroups)
+ {
+ putUpdateGroupAtomsInSamePeriodicImage(*cr->dd, mtop, globalState->box, globalState->x);
+ }
}
- GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
const bool printHostName = (cr->nnodes > 1);
gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
+ const bool disableNonbondedCalculation = (getenv("GMX_NO_NONBONDED") != nullptr);
+ if (disableNonbondedCalculation)
+ {
+ /* turn off non-bonded calculations */
+ GMX_LOG(mdlog.warning)
+ .asParagraph()
+ .appendText(
+ "Found environment variable GMX_NO_NONBONDED.\n"
+ "Disabling nonbonded calculations.");
+ }
+
+ MdrunScheduleWorkload runScheduleWork;
+
+ // Also populates the simulation constant workload description.
+ // Note: currently the default duty is DUTY_PP | DUTY_PME for all simulations, including those without PME,
+ // so this boolean is sufficient on all ranks to determine whether separate PME ranks are used,
+ // but this will no longer be the case if cr->duty is changed for !EEL_PME(fr->ic->eeltype).
+ const bool haveSeparatePmeRank = (!thisRankHasDuty(cr, DUTY_PP) || !thisRankHasDuty(cr, DUTY_PME));
+ runScheduleWork.simulationWork = createSimulationWorkload(*inputrec,
+ disableNonbondedCalculation,
+ devFlags,
+ havePPDomainDecomposition(cr),
+ haveSeparatePmeRank,
+ useGpuForNonbonded,
+ pmeRunMode,
+ useGpuForBonded,
+ useGpuForUpdate,
+ useGpuDirectHalo);
+
+ std::unique_ptr<DeviceStreamManager> deviceStreamManager = nullptr;
+
+ if (deviceInfo != nullptr)
+ {
+ if (runScheduleWork.simulationWork.havePpDomainDecomposition && thisRankHasDuty(cr, DUTY_PP))
+ {
+ dd_setup_dlb_resource_sharing(cr, deviceId);
+ }
+ deviceStreamManager = std::make_unique<DeviceStreamManager>(
+ *deviceInfo, havePPDomainDecomposition(cr), runScheduleWork.simulationWork, useTiming);
+ }
+
// If the user chose a task assignment, give them some hints
// where appropriate.
if (!userGpuTaskAssignment.empty())
{
- gpuTaskAssignments.logPerformanceHints(mdlog, ssize(gpuIdsToUse));
+ gpuTaskAssignments.logPerformanceHints(mdlog, numAvailableDevices);
}
if (PAR(cr))
.appendTextFormatted(
"This is simulation %d out of %d running as a composite GROMACS\n"
"multi-simulation job. Setup for this simulation:\n",
- ms->sim, ms->nsim);
+ ms->simulationIndex_,
+ ms->numSimulations_);
}
GMX_LOG(mdlog.warning)
- .appendTextFormatted("Using %d MPI %s\n", cr->nnodes,
+ .appendTextFormatted("Using %d MPI %s\n",
+ cr->nnodes,
# if GMX_THREAD_MPI
cr->nnodes == 1 ? "thread" : "threads"
# else
// that existing affinity setting was from OpenMP or something
// else, so we run this code both before and after we initialize
// the OpenMP support.
- gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo->nthreads_hw_avail, FALSE);
+ gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, FALSE);
/* Check and update the number of OpenMP threads requested */
- checkAndUpdateRequestedNumOpenmpThreads(&hw_opt, *hwinfo, cr, ms, physicalNodeComm.size_,
- pmeRunMode, mtop, *inputrec);
-
- gmx_omp_nthreads_init(mdlog, cr, hwinfo->nthreads_hw_avail, physicalNodeComm.size_,
- hw_opt.nthreads_omp, hw_opt.nthreads_omp_pme, !thisRankHasDuty(cr, DUTY_PP));
-
- // Enable FP exception detection, but not in
- // Release mode and not for compilers with known buggy FP
- // exception support (clang with any optimization) or suspected
- // buggy FP exception support (gcc 7.* with optimization).
-#if !defined NDEBUG \
- && !((defined __clang__ || (defined(__GNUC__) && !defined(__ICC) && __GNUC__ == 7)) \
- && defined __OPTIMIZE__)
- const bool bEnableFPE = true;
-#else
- const bool bEnableFPE = false;
-#endif
+ checkAndUpdateRequestedNumOpenmpThreads(
+ &hw_opt, *hwinfo_, cr, ms, physicalNodeComm.size_, pmeRunMode, mtop, *inputrec);
+
+ gmx_omp_nthreads_init(mdlog,
+ cr,
+ hwinfo_->nthreads_hw_avail,
+ physicalNodeComm.size_,
+ hw_opt.nthreads_omp,
+ hw_opt.nthreads_omp_pme,
+ !thisRankHasDuty(cr, DUTY_PP));
+
+ const bool bEnableFPE = gmxShouldEnableFPExceptions();
// FIXME - reconcile with gmx_feenableexcept() call from CommandLineModuleManager::run()
if (bEnableFPE)
{
}
/* Now that we know the setup is consistent, check for efficiency */
- check_resource_division_efficiency(hwinfo, gpuTaskAssignments.thisRankHasAnyGpuTask(),
- mdrunOptions.ntompOptionIsSet, cr, mdlog);
+ check_resource_division_efficiency(
+ hwinfo_, gpuTaskAssignments.thisRankHasAnyGpuTask(), mdrunOptions.ntompOptionIsSet, cr, mdlog);
/* getting number of PP/PME threads on this MPI / tMPI rank.
PME: env variable should be read only on one node to make sure it is
identical everywhere;
*/
- const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP) ? gmx_omp_nthreads_get(emntNonbonded)
- : gmx_omp_nthreads_get(emntPME);
- checkHardwareOversubscription(numThreadsOnThisRank, cr->nodeid, *hwinfo->hardwareTopology,
- physicalNodeComm, mdlog);
+ const int numThreadsOnThisRank = thisRankHasDuty(cr, DUTY_PP)
+ ? gmx_omp_nthreads_get(ModuleMultiThread::Nonbonded)
+ : gmx_omp_nthreads_get(ModuleMultiThread::Pme);
+ checkHardwareOversubscription(
+ numThreadsOnThisRank, cr->nodeid, *hwinfo_->hardwareTopology, physicalNodeComm, mdlog);
// Enable Peer access between GPUs where available
// Only for DD, only master PP rank needs to perform setup, and only if thread MPI plus
// any of the GPU communication features are active.
- if (DOMAINDECOMP(cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
- && (devFlags.enableGpuHaloExchange || devFlags.enableGpuPmePPComm))
+ if (haveDDAtomOrdering(*cr) && MASTER(cr) && thisRankHasDuty(cr, DUTY_PP) && GMX_THREAD_MPI
+ && (runScheduleWork.simulationWork.useGpuHaloExchange
+ || runScheduleWork.simulationWork.useGpuPmePpCommunication))
{
- setupGpuDevicePeerAccess(gpuIdsToUse, mdlog);
+ setupGpuDevicePeerAccess(gpuTaskAssignments.deviceIdsAssigned(), mdlog);
}
if (hw_opt.threadAffinity != ThreadAffinity::Off)
* - which indicates that probably the OpenMP library has changed it
* since we first checked).
*/
- gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo->nthreads_hw_avail, TRUE);
+ gmx_check_thread_affinity_set(mdlog, &hw_opt, hwinfo_->nthreads_hw_avail, TRUE);
int numThreadsOnThisNode, intraNodeThreadOffset;
- analyzeThreadsOnThisNode(physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode,
- &intraNodeThreadOffset);
+ analyzeThreadsOnThisNode(
+ physicalNodeComm, numThreadsOnThisRank, &numThreadsOnThisNode, &intraNodeThreadOffset);
/* Set the CPU affinity */
- gmx_set_thread_affinity(mdlog, cr, &hw_opt, *hwinfo->hardwareTopology, numThreadsOnThisRank,
- numThreadsOnThisNode, intraNodeThreadOffset, nullptr);
+ gmx_set_thread_affinity(mdlog,
+ cr,
+ &hw_opt,
+ *hwinfo_->hardwareTopology,
+ numThreadsOnThisRank,
+ numThreadsOnThisNode,
+ intraNodeThreadOffset,
+ nullptr);
}
if (mdrunOptions.timingOptions.resetStep > -1)
"The -resetstep functionality is deprecated, and may be removed in a "
"future version.");
}
- wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
+ std::unique_ptr<gmx_wallcycle> wcycle =
+ wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
if (PAR(cr))
{
/* Master synchronizes its value of reset_counters with all nodes
* including PME only nodes */
- int64_t reset_counters = wcycle_get_reset_counters(wcycle);
- gmx_bcast_sim(sizeof(reset_counters), &reset_counters, cr);
- wcycle_set_reset_counters(wcycle, reset_counters);
+ int64_t reset_counters = wcycle_get_reset_counters(wcycle.get());
+ gmx_bcast(sizeof(reset_counters), &reset_counters, cr->mpi_comm_mysim);
+ wcycle_set_reset_counters(wcycle.get(), reset_counters);
}
// Membrane embedding must be initialized before we call init_forcerec()
- if (doMembed)
- {
- if (MASTER(cr))
- {
- fprintf(stderr, "Initializing membed");
- }
- /* Note that membed cannot work in parallel because mtop is
- * changed here. Fix this if we ever want to make it run with
- * multiple ranks. */
- membed = init_membed(fplog, filenames.size(), filenames.data(), &mtop, inputrec,
- globalState.get(), cr, &mdrunOptions.checkpointOptions.period);
- }
-
- const bool thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
- std::unique_ptr<MDAtoms> mdAtoms;
- std::unique_ptr<gmx_vsite_t> vsite;
+ membedHolder.initializeMembed(fplog,
+ filenames.size(),
+ filenames.data(),
+ &mtop,
+ inputrec.get(),
+ globalState.get(),
+ cr,
+ &mdrunOptions.checkpointOptions.period);
+
+ const bool thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
+ std::unique_ptr<MDAtoms> mdAtoms;
+ std::unique_ptr<VirtualSitesHandler> vsite;
t_nrnb nrnb;
if (thisRankHasDuty(cr, DUTY_PP))
{
- mdModulesNotifier.notify(*cr);
- mdModulesNotifier.notify(&atomSets);
- mdModulesNotifier.notify(PeriodicBoundaryConditionType{ inputrec->ePBC });
- mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
+ setupNotifier.notify(*cr);
+ setupNotifier.notify(&atomSets);
+ setupNotifier.notify(mtop);
+ setupNotifier.notify(inputrec->pbcType);
+ setupNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
/* Initiate forcerecord */
- fr = new t_forcerec;
+ fr = std::make_unique<t_forcerec>();
fr->forceProviders = mdModules_->initForceProviders();
- init_forcerec(fplog, mdlog, fr, fcd, inputrec, &mtop, cr, box,
+ init_forcerec(fplog,
+ mdlog,
+ runScheduleWork.simulationWork,
+ fr.get(),
+ *inputrec,
+ mtop,
+ cr,
+ box,
opt2fn("-table", filenames.size(), filenames.data()),
opt2fn("-tablep", filenames.size(), filenames.data()),
- opt2fns("-tableb", filenames.size(), filenames.data()), *hwinfo,
- nonbondedDeviceInfo, useGpuForBonded,
- pmeRunMode == PmeRunMode::GPU && !thisRankHasDuty(cr, DUTY_PME), pforce, wcycle);
-
- // TODO Move this to happen during domain decomposition setup,
- // once stream and event handling works well with that.
- // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
- if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
+ opt2fns("-tableb", filenames.size(), filenames.data()),
+ pforce);
+ // Dirty hack, for fixing disres and orires should be made mdmodules
+ fr->fcdata->disres = disresdata;
+ if (gmx_mtop_ftype_count(mtop, F_ORIRES) > 0)
{
- GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps,
- "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
- void* streamLocal =
- Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
- void* streamNonLocal =
- Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
- GMX_LOG(mdlog.warning)
- .asParagraph()
- .appendTextFormatted(
- "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
- "GMX_GPU_DD_COMMS environment variable.");
- cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(
- cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal);
+ fr->fcdata->orires = std::make_unique<t_oriresdata>(
+ fplog, mtop, *inputrec, ms, globalState.get(), &atomSets);
+ }
+
+ // Save a handle to device stream manager to use elsewhere in the code
+ // TODO: Forcerec is not a correct place to store it.
+ fr->deviceStreamManager = deviceStreamManager.get();
+
+ if (runScheduleWork.simulationWork.useGpuPmePpCommunication && !thisRankHasDuty(cr, DUTY_PME))
+ {
+ GMX_RELEASE_ASSERT(
+ deviceStreamManager != nullptr,
+ "GPU device stream manager should be valid in order to use PME-PP direct "
+ "communications.");
+ GMX_RELEASE_ASSERT(
+ deviceStreamManager->streamIsValid(DeviceStreamType::PmePpTransfer),
+ "GPU PP-PME stream should be valid in order to use GPU PME-PP direct "
+ "communications.");
+ fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(
+ cr->mpi_comm_mysim,
+ cr->dd->pme_nodeid,
+ &cr->dd->pmeForceReceiveBuffer,
+ deviceStreamManager->context(),
+ deviceStreamManager->stream(DeviceStreamType::PmePpTransfer));
}
+ fr->nbv = Nbnxm::init_nb_verlet(mdlog,
+ *inputrec,
+ *fr,
+ cr,
+ *hwinfo_,
+ runScheduleWork.simulationWork.useGpuNonbonded,
+ deviceStreamManager.get(),
+ mtop,
+ box,
+ wcycle.get());
+ // TODO: Move the logic below to a GPU bonded builder
+ if (runScheduleWork.simulationWork.useGpuBonded)
+ {
+ GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+ "GPU device stream manager should be valid in order to use GPU "
+ "version of bonded forces.");
+ fr->listedForcesGpu = std::make_unique<ListedForcesGpu>(
+ mtop.ffparams,
+ fr->ic->epsfac * fr->fudgeQQ,
+ deviceStreamManager->context(),
+ deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)),
+ wcycle.get());
+ }
+ fr->longRangeNonbondeds = std::make_unique<CpuPpLongRangeNonbondeds>(fr->n_tpi,
+ fr->ic->ewaldcoeff_q,
+ fr->ic->epsilon_r,
+ fr->qsum,
+ fr->ic->eeltype,
+ fr->ic->vdwtype,
+ *inputrec,
+ &nrnb,
+ wcycle.get(),
+ fplog);
+
/* Initialize the mdAtoms structure.
* mdAtoms is not filled with atom data,
* as this can not be done now with domain decomposition.
}
/* Initialize the virtual site communication */
- vsite = initVsite(mtop, cr);
+ vsite = makeVirtualSitesHandler(
+ mtop, cr, fr->pbcType, updateGroups.updateGroupingPerMoleculeType());
calc_shifts(box, fr->shift_vec);
/* With periodic molecules the charge groups should be whole at start up
* and the virtual sites should not be far from their proper positions.
*/
- if (!inputrec->bContinuation && MASTER(cr) && !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+ if (!inputrec->bContinuation && MASTER(cr)
+ && !(inputrec->pbcType != PbcType::No && inputrec->bPeriodicMols))
{
/* Make molecules whole at start of run */
- if (fr->ePBC != epbcNONE)
+ if (fr->pbcType != PbcType::No)
{
- do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
+ do_pbc_first_mtop(fplog, inputrec->pbcType, box, &mtop, globalState->x.rvec_array());
}
if (vsite)
{
* for the initial distribution in the domain decomposition
* and for the initial shell prediction.
*/
- constructVsitesGlobal(mtop, globalState->x);
+ constructVirtualSitesGlobal(mtop, globalState->x);
}
}
+ // Make the DD reverse topology, now that any vsites that are present are available
+ if (haveDDAtomOrdering(*cr))
+ {
+ dd_make_reverse_top(fplog, cr->dd, mtop, vsite.get(), *inputrec, domdecOptions.ddBondedChecking);
+ }
if (EEL_PME(fr->ic->eeltype) || EVDW_PME(fr->ic->vdwtype))
{
PmeGpuProgramStorage pmeGpuProgram;
if (thisRankHasPmeGpuTask)
{
- pmeGpuProgram = buildPmeGpuProgram(pmeDeviceInfo);
+ GMX_RELEASE_ASSERT(
+ (deviceStreamManager != nullptr),
+ "GPU device stream manager should be initialized in order to use GPU for PME.");
+ GMX_RELEASE_ASSERT((deviceInfo != nullptr),
+ "GPU device should be initialized in order to use GPU for PME.");
+ pmeGpuProgram = buildPmeGpuProgram(deviceStreamManager->context());
}
/* Initiate PME if necessary,
if (cr->npmenodes > 0)
{
/* The PME only nodes need to know nChargePerturbed(FEP on Q) and nTypePerturbed(FEP on LJ)*/
- gmx_bcast_sim(sizeof(nChargePerturbed), &nChargePerturbed, cr);
- gmx_bcast_sim(sizeof(nTypePerturbed), &nTypePerturbed, cr);
+ gmx_bcast(sizeof(nChargePerturbed), &nChargePerturbed, cr->mpi_comm_mysim);
+ gmx_bcast(sizeof(nTypePerturbed), &nTypePerturbed, cr->mpi_comm_mysim);
}
if (thisRankHasDuty(cr, DUTY_PME))
{
try
{
- pmedata = gmx_pme_init(cr, getNumPmeDomains(cr->dd), inputrec, nChargePerturbed != 0,
- nTypePerturbed != 0, mdrunOptions.reproducible, ewaldcoeff_q,
- ewaldcoeff_lj, gmx_omp_nthreads_get(emntPME), pmeRunMode,
- nullptr, pmeDeviceInfo, pmeGpuProgram.get(), mdlog);
+ // TODO: This should be in the builder.
+ GMX_RELEASE_ASSERT(!runScheduleWork.simulationWork.useGpuPme
+ || (deviceStreamManager != nullptr),
+ "Device stream manager should be valid in order to use GPU "
+ "version of PME.");
+ GMX_RELEASE_ASSERT(
+ !runScheduleWork.simulationWork.useGpuPme
+ || deviceStreamManager->streamIsValid(DeviceStreamType::Pme),
+ "GPU PME stream should be valid in order to use GPU version of PME.");
+
+ const DeviceContext* deviceContext = runScheduleWork.simulationWork.useGpuPme
+ ? &deviceStreamManager->context()
+ : nullptr;
+ const DeviceStream* pmeStream =
+ runScheduleWork.simulationWork.useGpuPme
+ ? &deviceStreamManager->stream(DeviceStreamType::Pme)
+ : nullptr;
+
+ pmedata = gmx_pme_init(cr,
+ getNumPmeDomains(cr->dd),
+ inputrec.get(),
+ nChargePerturbed != 0,
+ nTypePerturbed != 0,
+ mdrunOptions.reproducible,
+ ewaldcoeff_q,
+ ewaldcoeff_lj,
+ gmx_omp_nthreads_get(ModuleMultiThread::Pme),
+ pmeRunMode,
+ nullptr,
+ deviceContext,
+ pmeStream,
+ pmeGpuProgram.get(),
+ mdlog);
}
GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
}
if (thisRankHasDuty(cr, DUTY_PP))
{
/* Assumes uniform use of the number of OpenMP threads */
- walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntDefault));
+ walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(ModuleMultiThread::Default));
if (inputrec->bPull)
{
/* Initialize pull code */
- pull_work = init_pull(fplog, inputrec->pull, inputrec, &mtop, cr, &atomSets,
+ pull_work = init_pull(fplog,
+ inputrec->pull.get(),
+ inputrec.get(),
+ mtop,
+ cr,
+ &atomSets,
inputrec->fepvals->init_lambda);
if (inputrec->pull->bXOutAverage || inputrec->pull->bFOutAverage)
{
if (inputrec->bRot)
{
/* Initialize enforced rotation code */
- enforcedRotation =
- init_rot(fplog, inputrec, filenames.size(), filenames.data(), cr, &atomSets,
- globalState.get(), &mtop, oenv, mdrunOptions, startingBehavior);
+ enforcedRotation = init_rot(fplog,
+ inputrec.get(),
+ filenames.size(),
+ filenames.data(),
+ cr,
+ &atomSets,
+ globalState.get(),
+ mtop,
+ oenv,
+ mdrunOptions,
+ startingBehavior);
}
t_swap* swap = nullptr;
- if (inputrec->eSwapCoords != eswapNO)
+ if (inputrec->eSwapCoords != SwapType::No)
{
/* Initialize ion swapping code */
- swap = init_swapcoords(fplog, inputrec,
+ swap = init_swapcoords(fplog,
+ inputrec.get(),
opt2fn_master("-swap", filenames.size(), filenames.data(), cr),
- &mtop, globalState.get(), &observablesHistory, cr, &atomSets,
- oenv, mdrunOptions, startingBehavior);
+ mtop,
+ globalState.get(),
+ &observablesHistory,
+ cr,
+ &atomSets,
+ oenv,
+ mdrunOptions,
+ startingBehavior);
}
/* Let makeConstraints know whether we have essential dynamics constraints. */
- auto constr = makeConstraints(mtop, *inputrec, pull_work, doEssentialDynamics, fplog,
- *mdAtoms->mdatoms(), cr, ms, &nrnb, wcycle, fr->bMolPBC);
+ auto constr = makeConstraints(mtop,
+ *inputrec,
+ pull_work,
+ doEssentialDynamics,
+ fplog,
+ cr,
+ updateGroups.useUpdateGroups(),
+ ms,
+ &nrnb,
+ wcycle.get(),
+ fr->bMolPBC,
+ &observablesReducerBuilder);
/* Energy terms and groups */
gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
// cos acceleration is only supported by md, but older tpr
// files might still combine it with other integrators
- GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == eiMD,
+ GMX_RELEASE_ASSERT(inputrec->cos_accel == 0.0 || inputrec->eI == IntegrationAlgorithm::MD,
"cos_acceleration is only supported by integrator=md");
/* Kinetic energy data */
- gmx_ekindata_t ekind;
- init_ekindata(fplog, &mtop, &(inputrec->opts), &ekind, inputrec->cos_accel);
+ gmx_ekindata_t ekind(inputrec->opts.ngtc,
+ inputrec->cos_accel,
+ gmx_omp_nthreads_get(ModuleMultiThread::Update));
/* Set up interactive MD (IMD) */
- auto imdSession =
- makeImdSession(inputrec, cr, wcycle, &enerd, ms, &mtop, mdlog,
- MASTER(cr) ? globalState->x.rvec_array() : nullptr, filenames.size(),
- filenames.data(), oenv, mdrunOptions.imdOptions, startingBehavior);
-
- if (DOMAINDECOMP(cr))
+ auto imdSession = makeImdSession(inputrec.get(),
+ cr,
+ wcycle.get(),
+ &enerd,
+ ms,
+ mtop,
+ mdlog,
+ MASTER(cr) ? globalState->x : gmx::ArrayRef<gmx::RVec>(),
+ filenames.size(),
+ filenames.data(),
+ oenv,
+ mdrunOptions.imdOptions,
+ startingBehavior);
+
+ if (haveDDAtomOrdering(*cr))
{
GMX_RELEASE_ASSERT(fr, "fr was NULL while cr->duty was DUTY_PP");
- /* This call is not included in init_domain_decomposition mainly
- * because fr->cginfo_mb is set later.
+ /* This call is not included in init_domain_decomposition
+ * because fr->atomInfoForEachMoleculeBlock is set later.
*/
- dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
- domdecOptions.checkBondedInteractions, fr->cginfo_mb);
+ makeBondedLinks(cr->dd, mtop, fr->atomInfoForEachMoleculeBlock);
}
- // TODO This is not the right place to manage the lifetime of
- // this data structure, but currently it's the easiest way to
- // make it work.
- MdrunScheduleWorkload runScheduleWork;
- // Also populates the simulation constant workload description.
- runScheduleWork.simulationWork = createSimulationWorkload(
- useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate,
- devFlags.enableGpuBufferOps, devFlags.enableGpuHaloExchange,
- devFlags.enableGpuPmePPComm, haveEwaldSurfaceContribution(*inputrec));
+ if (runScheduleWork.simulationWork.useGpuBufferOps)
+ {
+ fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
+ deviceStreamManager->context(),
+ deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal),
+ wcycle.get());
+ fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
+ deviceStreamManager->context(),
+ deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal),
+ wcycle.get());
+ }
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
if (gpusWereDetected
- && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME))
+ && ((runScheduleWork.simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME))
|| runScheduleWork.simulationWork.useGpuBufferOps))
{
- const void* pmeStream = pme_gpu_get_device_stream(fr->pmedata);
- const void* localStream =
- fr->nbv->gpu_nbv != nullptr
- ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local)
- : nullptr;
- const void* nonLocalStream =
- fr->nbv->gpu_nbv != nullptr
- ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal)
- : nullptr;
- const void* deviceContext = pme_gpu_get_device_context(fr->pmedata);
- const int paddingSize = pme_gpu_get_padding_size(fr->pmedata);
- GpuApiCallBehavior transferKind = (inputrec->eI == eiMD && !doRerun && !useModularSimulator)
- ? GpuApiCallBehavior::Async
- : GpuApiCallBehavior::Sync;
-
+ GpuApiCallBehavior transferKind =
+ (inputrec->eI == IntegrationAlgorithm::MD && !doRerun && !useModularSimulator)
+ ? GpuApiCallBehavior::Async
+ : GpuApiCallBehavior::Sync;
+ GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
+ "GPU device stream manager should be initialized to use GPU.");
stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
- pmeStream, localStream, nonLocalStream, deviceContext, transferKind, paddingSize, wcycle);
+ *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle.get());
fr->stateGpu = stateGpu.get();
}
GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
SimulatorBuilder simulatorBuilder;
+ simulatorBuilder.add(SimulatorStateData(
+ globalState.get(), localState, &observablesHistory, &enerd, &ekind));
+ simulatorBuilder.add(std::move(membedHolder));
+ simulatorBuilder.add(std::move(stopHandlerBuilder_));
+ simulatorBuilder.add(SimulatorConfig(mdrunOptions, startingBehavior, &runScheduleWork));
+
+
+ simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv, &observablesReducerBuilder));
+ simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle.get()));
+ simulatorBuilder.add(ConstraintsParam(
+ constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, vsite.get()));
+ // TODO: Separate `fr` to a separate add, and make the `build` handle the coupling sensibly.
+ simulatorBuilder.add(LegacyInput(
+ static_cast<int>(filenames.size()), filenames.data(), inputrec.get(), fr.get()));
+ simulatorBuilder.add(ReplicaExchangeParameters(replExParams));
+ simulatorBuilder.add(InteractiveMD(imdSession.get()));
+ simulatorBuilder.add(SimulatorModules(mdModules_->outputProvider(), mdModules_->notifiers()));
+ simulatorBuilder.add(CenterOfMassPulling(pull_work));
+ // Todo move to an MDModule
+ simulatorBuilder.add(IonSwapping(swap));
+ simulatorBuilder.add(TopologyData(mtop, &localTopology, mdAtoms.get()));
+ simulatorBuilder.add(BoxDeformationHandle(deform.get()));
+ simulatorBuilder.add(std::move(modularSimulatorCheckpointData));
+
// build and run simulator object based on user-input
- auto simulator = simulatorBuilder.build(
- inputIsCompatibleWithModularSimulator, fplog, cr, ms, mdlog,
- static_cast<int>(filenames.size()), filenames.data(), oenv, mdrunOptions,
- startingBehavior, vsite.get(), constr.get(),
- enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, deform.get(),
- mdModules_->outputProvider(), mdModules_->notifier(), inputrec, imdSession.get(),
- pull_work, swap, &mtop, fcd, globalState.get(), &observablesHistory, mdAtoms.get(),
- &nrnb, wcycle, fr, &enerd, &ekind, &runScheduleWork, replExParams, membed,
- walltime_accounting, std::move(stopHandlerBuilder_), doRerun);
+ auto simulator = simulatorBuilder.build(useModularSimulator);
simulator->run();
if (fr->pmePpCommGpu)
{
GMX_RELEASE_ASSERT(pmedata, "pmedata was NULL while cr->duty was not DUTY_PP");
/* do PME only */
- walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(emntPME));
- gmx_pmeonly(pmedata, cr, &nrnb, wcycle, walltime_accounting, inputrec, pmeRunMode);
+ walltime_accounting = walltime_accounting_init(gmx_omp_nthreads_get(ModuleMultiThread::Pme));
+ gmx_pmeonly(pmedata,
+ cr,
+ &nrnb,
+ wcycle.get(),
+ walltime_accounting,
+ inputrec.get(),
+ pmeRunMode,
+ runScheduleWork.simulationWork.useGpuPmePpCommunication,
+ deviceStreamManager.get());
}
- wallcycle_stop(wcycle, ewcRUN);
+ wallcycle_stop(wcycle.get(), WallCycleCounter::Run);
/* Finish up, write some stuff
* if rerunMD, don't write last frame again
*/
- finish_run(fplog, mdlog, cr, inputrec, &nrnb, wcycle, walltime_accounting,
- fr ? fr->nbv.get() : nullptr, pmedata, EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
-
- // clean up cycle counter
- wallcycle_destroy(wcycle);
-
+ finish_run(fplog,
+ mdlog,
+ cr,
+ *inputrec,
+ &nrnb,
+ wcycle.get(),
+ walltime_accounting,
+ fr ? fr->nbv.get() : nullptr,
+ pmedata,
+ EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
+
+
+ deviceStreamManager.reset(nullptr);
// Free PME data
if (pmedata)
{
}
// FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
- // before we destroy the GPU context(s) in free_gpu_resources().
+ // before we destroy the GPU context(s)
// Pinned buffers are associated with contexts in CUDA.
// As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
mdAtoms.reset(nullptr);
globalState.reset(nullptr);
+ localStateInstance.reset(nullptr);
mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
+ fr.reset(nullptr); // destruct forcerec before gpu
+ // TODO convert to C++ so we can get rid of these frees
+ sfree(disresdata);
- /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
- free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
- free_gpu(nonbondedDeviceInfo);
- free_gpu(pmeDeviceInfo);
- done_forcerec(fr, mtop.molblock.size());
- sfree(fcd);
+ if (!hwinfo_->deviceInfoList.empty())
+ {
+ /* stop the GPU profiler (only CUDA) */
+ stopGpuProfiler();
+ }
- if (doMembed)
+ /* With tMPI we need to wait for all ranks to finish deallocation before
+ * destroying the CUDA context as some tMPI ranks may be sharing
+ * GPU and context.
+ *
+ * This is not a concern in OpenCL where we use one context per rank.
+ *
+ * Note: it is safe to not call the barrier on the ranks which do not use GPU,
+ * but it is easier and more futureproof to call it on the whole node.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ * \todo Remove this physical node barrier after making sure
+ * that it's not needed anymore (with a shared GPU run).
+ */
+ if (GMX_THREAD_MPI)
{
- free_membed(membed);
+ physicalNodeComm.barrier();
+ }
+
+ if (!devFlags.usingCudaAwareMpi)
+ {
+ // Don't reset GPU in case of CUDA-AWARE MPI
+ // UCX creates CUDA buffers which are cleaned-up as part of MPI_Finalize()
+ // resetting the device before MPI_Finalize() results in crashes inside UCX
+ releaseDevice(deviceInfo);
}
/* Does what it says */
/* we need to join all threads. The sub-threads join when they
exit this function, but the master thread needs to be told to
wait for that. */
- if (PAR(cr) && MASTER(cr))
+ if (MASTER(cr))
{
tMPI_Finalize();
}
#endif
return rc;
-}
+} // namespace gmx
Mdrunner::~Mdrunner()
{
Mdrunner::Mdrunner(Mdrunner&&) noexcept = default;
-//NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265
+//NOLINTNEXTLINE(performance-noexcept-move-constructor) working around GCC bug 58265 in CentOS 7
Mdrunner& Mdrunner::operator=(Mdrunner&& /*handle*/) noexcept(BUGFREE_NOEXCEPT_STRING) = default;
class Mdrunner::BuilderImplementation
real forceWarningThreshold,
StartingBehavior startingBehavior);
+ void addHardwareDetectionResult(const gmx_hw_info_t* hwinfo);
+
void addDomdec(const DomdecOptions& options);
+ void addInput(SimulationInputHandle inputHolder);
+
void addVerletList(int nstlist);
void addReplicaExchange(const ReplicaExchangeParameters& params);
//! Command-line override for the duration of a neighbor list with the Verlet scheme.
int nstlist_ = 0;
+ //! World communicator, used for hardware detection and task assignment
+ MPI_Comm libraryWorldCommunicator_ = MPI_COMM_NULL;
+
//! Multisim communicator handle.
gmx_multisim_t* multiSimulation_;
//! mdrun communicator
- MPI_Comm communicator_ = MPI_COMM_NULL;
+ MPI_Comm simulationCommunicator_ = MPI_COMM_NULL;
//! Print a warning if any force is larger than this (in kJ/mol nm).
real forceWarningThreshold_ = -1;
//! The modules that comprise the functionality of mdrun.
std::unique_ptr<MDModules> mdModules_;
+ //! Detected hardware.
+ const gmx_hw_info_t* hwinfo_ = nullptr;
+
//! \brief Parallelism information.
gmx_hw_opt_t hardwareOptions_;
* \brief Builder for simulation stop signal handler.
*/
std::unique_ptr<StopHandlerBuilder> stopHandlerBuilder_ = nullptr;
+
+ /*!
+ * \brief Sources for initial simulation state.
+ *
+ * See issue #3652 for near-term refinements to the SimulationInput interface.
+ *
+ * See issue #3379 for broader discussion on API aspects of simulation inputs and outputs.
+ */
+ SimulationInputHandle inputHolder_;
};
Mdrunner::BuilderImplementation::BuilderImplementation(std::unique_ptr<MDModules> mdModules,
compat::not_null<SimulationContext*> context) :
mdModules_(std::move(mdModules))
{
- communicator_ = context->communicator_;
- multiSimulation_ = context->multiSimulation_.get();
+ libraryWorldCommunicator_ = context->libraryWorldCommunicator_;
+ simulationCommunicator_ = context->simulationCommunicator_;
+ multiSimulation_ = context->multiSimulation_.get();
}
Mdrunner::BuilderImplementation::~BuilderImplementation() = default;
newRunner.filenames = filenames_;
- newRunner.communicator = communicator_;
+ newRunner.libraryWorldCommunicator = libraryWorldCommunicator_;
+
+ newRunner.simulationCommunicator = simulationCommunicator_;
// nullptr is a valid value for the multisim handle
newRunner.ms = multiSimulation_;
+ if (hwinfo_)
+ {
+ newRunner.hwinfo_ = hwinfo_;
+ }
+ else
+ {
+ GMX_THROW(gmx::APIError(
+ "MdrunnerBuilder::addHardwareDetectionResult() is required before build()"));
+ }
+
+ if (inputHolder_)
+ {
+ newRunner.inputHolder_ = std::move(inputHolder_);
+ }
+ else
+ {
+ GMX_THROW(gmx::APIError("MdrunnerBuilder::addInput() is required before build()."));
+ }
+
// \todo Clarify ownership and lifetime management for gmx_output_env_t
// \todo Update sanity checking when output environment has clearly specified invariants.
// Initialization and default values for oenv are not well specified in the current version.
return newRunner;
}
+//! \brief Store a non-owning pointer to the hardware detection result, to be handed to the Mdrunner in build().
+//  NOTE(review): the pointer is required before build() — build() throws APIError when it is absent; lifetime of
+//  *hwinfo is presumably managed by the caller (the SimulationContext) — confirm against callers.
+void Mdrunner::BuilderImplementation::addHardwareDetectionResult(const gmx_hw_info_t* hwinfo)
+{
+    hwinfo_ = hwinfo;
+}
+
void Mdrunner::BuilderImplementation::addNonBonded(const char* nbpu_opt)
{
nbpu_opt_ = nbpu_opt;
stopHandlerBuilder_ = std::move(builder);
}
+//! \brief Take ownership of the handle to the sources of initial simulation state.
+//  Required before build(), which throws APIError if no input holder was provided.
+void Mdrunner::BuilderImplementation::addInput(SimulationInputHandle inputHolder)
+{
+    inputHolder_ = std::move(inputHolder);
+}
+
MdrunnerBuilder::MdrunnerBuilder(std::unique_ptr<MDModules> mdModules,
compat::not_null<SimulationContext*> context) :
impl_{ std::make_unique<Mdrunner::BuilderImplementation>(std::move(mdModules), context) }
MdrunnerBuilder::~MdrunnerBuilder() = default;
+/*! \brief Forward the hardware detection result to the builder implementation.
+ *
+ * \param hwinfo  Non-owning pointer to detected hardware; must outlive the built Mdrunner.
+ * \returns Reference to this builder, to allow call chaining.
+ */
+MdrunnerBuilder& MdrunnerBuilder::addHardwareDetectionResult(const gmx_hw_info_t* hwinfo)
+{
+    impl_->addHardwareDetectionResult(hwinfo);
+    return *this;
+}
+
MdrunnerBuilder& MdrunnerBuilder::addSimulationMethod(const MdrunOptions& options,
real forceWarningThreshold,
const StartingBehavior startingBehavior)
return *this;
}
+/*! \brief Register the handle to the simulation input sources.
+ *
+ * \param input  Handle to initial simulation state sources (moved into the builder).
+ * \returns Reference to this builder, to allow call chaining.
+ */
+MdrunnerBuilder& MdrunnerBuilder::addInput(SimulationInputHandle input)
+{
+    impl_->addInput(std::move(input));
+    return *this;
+}
+
+// Move construction/assignment are defaulted: the builder's state lives entirely
+// in the pImpl unique_ptr, whose transfer is noexcept.
MdrunnerBuilder::MdrunnerBuilder(MdrunnerBuilder&&) noexcept = default;
MdrunnerBuilder& MdrunnerBuilder::operator=(MdrunnerBuilder&&) noexcept = default;