#include "gromacs/domdec/localatomsetmanager.h"
#include "gromacs/domdec/partition.h"
#include "gromacs/ewald/ewald_utils.h"
-#include "gromacs/ewald/pme.h"
#include "gromacs/ewald/pme_gpu_program.h"
+#include "gromacs/ewald/pme_only.h"
#include "gromacs/ewald/pme_pp_comm_gpu.h"
#include "gromacs/fileio/checkpoint.h"
#include "gromacs/fileio/gmxfio.h"
#include "gromacs/mdlib/sighandler.h"
#include "gromacs/mdlib/stophandler.h"
#include "gromacs/mdlib/updategroups.h"
+#include "gromacs/mdlib/vsite.h"
#include "gromacs/mdrun/mdmodules.h"
#include "gromacs/mdrun/simulationcontext.h"
#include "gromacs/mdrunutility/handlerestart.h"
#include "gromacs/mdtypes/commrec.h"
#include "gromacs/mdtypes/enerdata.h"
#include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/forcerec.h"
#include "gromacs/mdtypes/group.h"
#include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
#include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
#include "gromacs/mdtypes/mdrunoptions.h"
#include "gromacs/mdtypes/observableshistory.h"
#include "gromacs/mdtypes/simulation_workload.h"
GMX_LOG(mdlog.warning)
.asParagraph()
.appendTextFormatted(
- "This run uses the 'GPU halo exchange' feature, enabled by the "
+ "This run has requested the 'GPU halo exchange' feature, enabled by "
+ "the "
"GMX_GPU_DD_COMMS environment variable.");
}
else
bool makeGpuPairList,
const gmx::CpuInfo& cpuinfo)
{
+ // We checked the cut-offs in grompp, but double-check here.
+ // We have PME+LJcutoff kernels for rcoulomb>rvdw, so when the Coulomb type
+ // satisfies EEL_PME_EWALD and Van der Waals uses a plain cut-off we only
+ // require rcoulomb >= rvdw. Every other combination needs identical cut-offs.
+ // Note: vdwtype is compared against the Van der Waals enumerator (evdwCUT),
+ // not the electrostatics enumerator.
+ if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == evdwCUT)
+ {
+ GMX_RELEASE_ASSERT(ir->rcoulomb >= ir->rvdw,
+ "With Verlet lists and PME we should have rcoulomb>=rvdw");
+ }
+ else
+ {
+ GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw,
+ "With Verlet lists and no PME rcoulomb and rvdw should be identical");
+ }
/* For NVE simulations, we will retain the initial list buffer */
if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
{
// the inputrec read by the master rank. The ranks can now all run
// the task-deciding functions and will agree on the result
// without needing to communicate.
- //
- // TODO Should we do the communication in debug mode to support
- // having an assertion?
const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
// Note that these variables describe only their own node.
// TODO: Error handling
mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
- const auto& mdModulesNotifier = mdModules_->notifier().notifier_;
+ const auto& mdModulesNotifier = mdModules_->notifier().simulationSetupNotifications_;
if (inputrec->internalParameters != nullptr)
{
cr->npmenodes = 0;
cr->duty = (DUTY_PP | DUTY_PME);
- if (inputrec->ePBC == epbcSCREW)
+ if (inputrec->pbcType == PbcType::Screw)
{
gmx_fatal(FARGS, "pbc=screw is only implemented with domain decomposition");
}
EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
// Get the device handles for the modules, nullptr when no task is assigned.
- gmx_device_info_t* nonbondedDeviceInfo = gpuTaskAssignments.initNonbondedDevice(cr);
- gmx_device_info_t* pmeDeviceInfo = gpuTaskAssignments.initPmeDevice();
+ DeviceInformation* nonbondedDeviceInfo = gpuTaskAssignments.initNonbondedDevice(cr);
+ DeviceInformation* pmeDeviceInfo = gpuTaskAssignments.initPmeDevice();
// TODO Initialize GPU streams here.
const bool thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
std::unique_ptr<MDAtoms> mdAtoms;
std::unique_ptr<gmx_vsite_t> vsite;
+ std::unique_ptr<GpuBonded> gpuBonded;
t_nrnb nrnb;
if (thisRankHasDuty(cr, DUTY_PP))
{
mdModulesNotifier.notify(*cr);
mdModulesNotifier.notify(&atomSets);
- mdModulesNotifier.notify(PeriodicBoundaryConditionType{ inputrec->ePBC });
+ mdModulesNotifier.notify(inputrec->pbcType);
mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
/* Initiate forcerecord */
fr = new t_forcerec;
init_forcerec(fplog, mdlog, fr, fcd, inputrec, &mtop, cr, box,
opt2fn("-table", filenames.size(), filenames.data()),
opt2fn("-tablep", filenames.size(), filenames.data()),
- opt2fns("-tableb", filenames.size(), filenames.data()), *hwinfo,
- nonbondedDeviceInfo, useGpuForBonded,
- pmeRunMode == PmeRunMode::GPU && !thisRankHasDuty(cr, DUTY_PME), pforce, wcycle);
-
- // TODO Move this to happen during domain decomposition setup,
- // once stream and event handling works well with that.
- // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
- if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
+ opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
+
+ if (devFlags.enableGpuPmePPComm && !thisRankHasDuty(cr, DUTY_PME))
{
- GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps,
- "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
- void* streamLocal =
- Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
- void* streamNonLocal =
- Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
- GMX_LOG(mdlog.warning)
- .asParagraph()
- .appendTextFormatted(
- "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
- "GMX_GPU_DD_COMMS environment variable.");
- cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(
- cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal);
+ fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(cr->mpi_comm_mysim, cr->dd->pme_nodeid);
+ }
+
+ fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, nonbondedDeviceInfo,
+ &mtop, box, wcycle);
+ if (useGpuForBonded)
+ {
+ auto stream = havePPDomainDecomposition(cr)
+ ? Nbnxm::gpu_get_command_stream(
+ fr->nbv->gpu_nbv, gmx::InteractionLocality::NonLocal)
+ : Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv,
+ gmx::InteractionLocality::Local);
+ gpuBonded = std::make_unique<GpuBonded>(mtop.ffparams, stream, wcycle);
+ fr->gpuBonded = gpuBonded.get();
}
/* Initialize the mdAtoms structure.
/* With periodic molecules the charge groups should be whole at start up
* and the virtual sites should not be far from their proper positions.
*/
- if (!inputrec->bContinuation && MASTER(cr) && !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+ if (!inputrec->bContinuation && MASTER(cr)
+ && !(inputrec->pbcType != PbcType::No && inputrec->bPeriodicMols))
{
/* Make molecules whole at start of run */
- if (fr->ePBC != epbcNONE)
+ if (fr->pbcType != PbcType::No)
{
- do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
+ do_pbc_first_mtop(fplog, inputrec->pbcType, box, &mtop, globalState->x.rvec_array());
}
if (vsite)
{
/* This call is not included in init_domain_decomposition mainly
* because fr->cginfo_mb is set later.
*/
- dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
+ dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec,
domdecOptions.checkBondedInteractions, fr->cginfo_mb);
}
// make it work.
MdrunScheduleWorkload runScheduleWork;
// Also populates the simulation constant workload description.
- runScheduleWork.simulationWork = createSimulationWorkload(
- useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate,
- devFlags.enableGpuBufferOps, devFlags.enableGpuHaloExchange,
- devFlags.enableGpuPmePPComm, haveEwaldSurfaceContribution(*inputrec));
+ runScheduleWork.simulationWork =
+ createSimulationWorkload(*inputrec, useGpuForNonbonded, pmeRunMode, useGpuForBonded,
+ useGpuForUpdate, devFlags.enableGpuBufferOps,
+ devFlags.enableGpuHaloExchange, devFlags.enableGpuPmePPComm);
std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
if (gpusWereDetected
}
// FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
- // before we destroy the GPU context(s) in free_gpu_resources().
+ // before we destroy the GPU context(s) in free_gpu().
// Pinned buffers are associated with contexts in CUDA.
// As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
mdAtoms.reset(nullptr);
globalState.reset(nullptr);
mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
+ gpuBonded.reset(nullptr);
+ /* Free pinned buffers in *fr */
+ delete fr;
+ fr = nullptr;
+
+ if (hwinfo->gpu_info.n_dev > 0)
+ {
+ /* stop the GPU profiler (only CUDA) */
+ stopGpuProfiler();
+ }
+
+ /* With tMPI we need to wait for all ranks to finish deallocation before
+ * destroying the CUDA context in free_gpu() as some tMPI ranks may be sharing
+ * GPU and context.
+ *
+ * This is not a concern in OpenCL where we use one context per rank which
+ * is freed in nbnxn_gpu_free().
+ *
+ * Note: it is safe to not call the barrier on the ranks which do not use a GPU,
+ * but it is easier and more future-proof to call it on the whole node.
+ *
+ * Note that the barrier needs to be reached even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ * \todo Remove this physical node barrier after making sure
+ * that it's not needed anymore (with a shared GPU run).
+ */
+ if (GMX_THREAD_MPI)
+ {
+ physicalNodeComm.barrier();
+ }
- /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
- free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
free_gpu(nonbondedDeviceInfo);
free_gpu(pmeDeviceInfo);
- done_forcerec(fr, mtop.molblock.size());
sfree(fcd);
if (doMembed)