Merge branch release-2020 into master
[alexxy/gromacs.git] / src / gromacs / mdrun / runner.cpp
index 93c934a9965807922dc74da7d12fb80030afc069..6827e7e9de8ca19bb8b710788e2b79a406e8df3c 100644 (file)
@@ -64,8 +64,8 @@
 #include "gromacs/domdec/localatomsetmanager.h"
 #include "gromacs/domdec/partition.h"
 #include "gromacs/ewald/ewald_utils.h"
-#include "gromacs/ewald/pme.h"
 #include "gromacs/ewald/pme_gpu_program.h"
+#include "gromacs/ewald/pme_only.h"
 #include "gromacs/ewald/pme_pp_comm_gpu.h"
 #include "gromacs/fileio/checkpoint.h"
 #include "gromacs/fileio/gmxfio.h"
 #include "gromacs/mdlib/sighandler.h"
 #include "gromacs/mdlib/stophandler.h"
 #include "gromacs/mdlib/updategroups.h"
+#include "gromacs/mdlib/vsite.h"
 #include "gromacs/mdrun/mdmodules.h"
 #include "gromacs/mdrun/simulationcontext.h"
 #include "gromacs/mdrunutility/handlerestart.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/mdtypes/enerdata.h"
 #include "gromacs/mdtypes/fcdata.h"
+#include "gromacs/mdtypes/forcerec.h"
 #include "gromacs/mdtypes/group.h"
 #include "gromacs/mdtypes/inputrec.h"
+#include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/mdtypes/md_enums.h"
+#include "gromacs/mdtypes/mdatom.h"
 #include "gromacs/mdtypes/mdrunoptions.h"
 #include "gromacs/mdtypes/observableshistory.h"
 #include "gromacs/mdtypes/simulation_workload.h"
@@ -238,7 +242,7 @@ static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger& md
             GMX_LOG(mdlog.warning)
                     .asParagraph()
                     .appendTextFormatted(
-                            "This run uses the 'GPU halo exchange' feature, enabled by the "
+                            "This run has requested the 'GPU halo exchange' feature, enabled by the "
                             "GMX_GPU_DD_COMMS environment variable.");
         }
         else
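The development features in this block are opt-in via environment variables; the message above is emitted when GMX_GPU_DD_COMMS is set. A minimal sketch of that gating pattern in plain C++ (the helper name is illustrative, not part of the GROMACS API):

```cpp
#include <cstdlib>

// Sketch: a development feature counts as "requested" when its
// environment variable is set at all; the value is not inspected.
static bool devFeatureRequested(const char* envName)
{
    return std::getenv(envName) != nullptr;
}

// Usage: const bool haloExchangeRequested = devFeatureRequested("GMX_GPU_DD_COMMS");
```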
@@ -395,6 +400,18 @@ static void prepare_verlet_scheme(FILE*               fplog,
                                   bool                makeGpuPairList,
                                   const gmx::CpuInfo& cpuinfo)
 {
+    // We checked the cut-offs in grompp, but double-check here.
+    // We have PME+LJcutoff kernels for rcoulomb>rvdw.
+    if (EEL_PME_EWALD(ir->coulombtype) && ir->vdwtype == evdwCUT)
+    {
+        GMX_RELEASE_ASSERT(ir->rcoulomb >= ir->rvdw,
+                           "With Verlet lists and PME we should have rcoulomb>=rvdw");
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(ir->rcoulomb == ir->rvdw,
+                           "With Verlet lists and no PME, rcoulomb and rvdw should be identical");
+    }
     /* For NVE simulations, we will retain the initial list buffer */
     if (EI_DYNAMICS(ir->eI) && ir->verletbuf_tol > 0 && !(EI_MD(ir->eI) && ir->etc == etcNO))
     {
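The new assertions encode the cut-off rule stated in the comment: dedicated PME + LJ cut-off kernels exist for rcoulomb > rvdw, so that combination only requires rcoulomb >= rvdw, while every other combination requires identical cut-offs. A standalone sketch of the same rule, outside the GROMACS API:

```cpp
#include <stdexcept>

// Sketch of the rule asserted above: with PME electrostatics and
// cut-off LJ, rcoulomb may exceed rvdw; otherwise the two cut-offs
// must match exactly.
static void checkVerletCutoffs(bool usesPme, bool vdwIsCutoff, double rcoulomb, double rvdw)
{
    if (usesPme && vdwIsCutoff)
    {
        if (rcoulomb < rvdw)
        {
            throw std::invalid_argument("With PME, rcoulomb >= rvdw is required");
        }
    }
    else if (rcoulomb != rvdw)
    {
        throw std::invalid_argument("Without PME, rcoulomb and rvdw must be identical");
    }
}
```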
@@ -833,9 +850,6 @@ int Mdrunner::mdrunner()
     // the inputrec read by the master rank. The ranks can now all run
     // the task-deciding functions and will agree on the result
     // without needing to communicate.
-    //
-    // TODO Should we do the communication in debug mode to support
-    // having an assertion?
     const bool useDomainDecomposition = (PAR(cr) && !(EI_TPI(inputrec->eI) || inputrec->eI == eiNM));
 
     // Note that these variables describe only their own node.
@@ -896,7 +910,7 @@ int Mdrunner::mdrunner()
 
     // TODO: Error handling
     mdModules_->assignOptionsToModules(*inputrec->params, nullptr);
-    const auto& mdModulesNotifier = mdModules_->notifier().notifier_;
+    const auto& mdModulesNotifier = mdModules_->notifier().simulationSetupNotifications_;
 
     if (inputrec->internalParameters != nullptr)
     {
@@ -1109,7 +1123,7 @@ int Mdrunner::mdrunner()
         cr->npmenodes = 0;
         cr->duty      = (DUTY_PP | DUTY_PME);
 
-        if (inputrec->ePBC == epbcSCREW)
+        if (inputrec->pbcType == PbcType::Screw)
         {
             gmx_fatal(FARGS, "pbc=screw is only implemented with domain decomposition");
         }
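This hunk is part of the tree-wide migration from plain integer constants (epbcNONE, epbcSCREW, ...) to the scoped PbcType enumeration. A minimal sketch of why the scoped form is preferred (the enumerator list here is illustrative, not the complete GROMACS definition):

```cpp
// Old style: unscoped constants are just ints, so nothing stops
// code from comparing a PBC value against, say, an electrostatics
// enum constant.
enum
{
    epbcXYZ,
    epbcNONE,
    epbcXY,
    epbcSCREW
};

// New style: a scoped enumeration. Mixing PbcType with other enums
// or raw ints no longer compiles, which is what makes comparisons
// like inputrec->pbcType == PbcType::Screw safe to refactor.
enum class PbcType : int
{
    Xyz,
    No,
    XY,
    Screw
};
```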
@@ -1126,8 +1140,8 @@ int Mdrunner::mdrunner()
             EEL_PME(inputrec->coulombtype) && thisRankHasDuty(cr, DUTY_PME));
 
     // Get the device handles for the modules, nullptr when no task is assigned.
-    gmx_device_info_t* nonbondedDeviceInfo = gpuTaskAssignments.initNonbondedDevice(cr);
-    gmx_device_info_t* pmeDeviceInfo       = gpuTaskAssignments.initPmeDevice();
+    DeviceInformation* nonbondedDeviceInfo = gpuTaskAssignments.initNonbondedDevice(cr);
+    DeviceInformation* pmeDeviceInfo       = gpuTaskAssignments.initPmeDevice();
 
     // TODO Initialize GPU streams here.
 
@@ -1307,13 +1321,14 @@ int Mdrunner::mdrunner()
     const bool                   thisRankHasPmeGpuTask = gpuTaskAssignments.thisRankHasPmeGpuTask();
     std::unique_ptr<MDAtoms>     mdAtoms;
     std::unique_ptr<gmx_vsite_t> vsite;
+    std::unique_ptr<GpuBonded>   gpuBonded;
 
     t_nrnb nrnb;
     if (thisRankHasDuty(cr, DUTY_PP))
     {
         mdModulesNotifier.notify(*cr);
         mdModulesNotifier.notify(&atomSets);
-        mdModulesNotifier.notify(PeriodicBoundaryConditionType{ inputrec->ePBC });
+        mdModulesNotifier.notify(inputrec->pbcType);
         mdModulesNotifier.notify(SimulationTimeStep{ inputrec->delta_t });
         /* Initiate forcerecord */
         fr                 = new t_forcerec;
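The mdModulesNotifier.notify(...) calls above dispatch on the static type of the argument: modules subscribe per payload type, and a notification fans out to every subscriber registered for that type. A hypothetical sketch of such a type-keyed notifier (the real gmx::MDModulesNotifier is template-based; this std::any version only illustrates the dispatch idea):

```cpp
#include <any>
#include <functional>
#include <typeindex>
#include <unordered_map>
#include <vector>

// Sketch: callbacks are stored per payload type; notify(event)
// invokes every callback registered for decltype(event).
class TypeNotifier
{
public:
    template<typename Event>
    void subscribe(std::function<void(const Event&)> callback)
    {
        handlers_[typeid(Event)].push_back(
                [callback](const std::any& event) { callback(std::any_cast<const Event&>(event)); });
    }

    template<typename Event>
    void notify(const Event& event) const
    {
        const auto found = handlers_.find(typeid(Event));
        if (found != handlers_.end())
        {
            for (const auto& handler : found->second)
            {
                handler(std::any(event)); // wrap so the type-erased handler can unwrap
            }
        }
    }

private:
    std::unordered_map<std::type_index, std::vector<std::function<void(const std::any&)>>> handlers_;
};
```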
@@ -1321,28 +1336,24 @@ int Mdrunner::mdrunner()
         init_forcerec(fplog, mdlog, fr, fcd, inputrec, &mtop, cr, box,
                       opt2fn("-table", filenames.size(), filenames.data()),
                       opt2fn("-tablep", filenames.size(), filenames.data()),
-                      opt2fns("-tableb", filenames.size(), filenames.data()), *hwinfo,
-                      nonbondedDeviceInfo, useGpuForBonded,
-                      pmeRunMode == PmeRunMode::GPU && !thisRankHasDuty(cr, DUTY_PME), pforce, wcycle);
-
-        // TODO Move this to happen during domain decomposition setup,
-        // once stream and event handling works well with that.
-        // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
-        if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
+                      opt2fns("-tableb", filenames.size(), filenames.data()), pforce);
+
+        if (devFlags.enableGpuPmePPComm && !thisRankHasDuty(cr, DUTY_PME))
         {
-            GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps,
-                               "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
-            void* streamLocal =
-                    Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
-            void* streamNonLocal =
-                    Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
-            GMX_LOG(mdlog.warning)
-                    .asParagraph()
-                    .appendTextFormatted(
-                            "NOTE: This run uses the 'GPU halo exchange' feature, enabled by the "
-                            "GMX_GPU_DD_COMMS environment variable.");
-            cr->dd->gpuHaloExchange = std::make_unique<GpuHaloExchange>(
-                    cr->dd, cr->mpi_comm_mysim, streamLocal, streamNonLocal);
+            fr->pmePpCommGpu = std::make_unique<gmx::PmePpCommGpu>(cr->mpi_comm_mysim, cr->dd->pme_nodeid);
+        }
+
+        fr->nbv = Nbnxm::init_nb_verlet(mdlog, inputrec, fr, cr, *hwinfo, nonbondedDeviceInfo,
+                                        &mtop, box, wcycle);
+        if (useGpuForBonded)
+        {
+            auto stream = havePPDomainDecomposition(cr)
+                                  ? Nbnxm::gpu_get_command_stream(
+                                            fr->nbv->gpu_nbv, gmx::InteractionLocality::NonLocal)
+                                  : Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv,
+                                                                  gmx::InteractionLocality::Local);
+            gpuBonded     = std::make_unique<GpuBonded>(mtop.ffparams, stream, wcycle);
+            fr->gpuBonded = gpuBonded.get();
         }
 
         /* Initialize the mdAtoms structure.
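Two details in this hunk are worth spelling out. First, the bonded GPU work is enqueued on the nonbonded non-local stream when there is PP domain decomposition and on the local stream otherwise, so it can overlap with the other stream's work. Second, the runner now owns the GpuBonded object via std::unique_ptr, while fr->gpuBonded is only a non-owning view. A hedged sketch of that ownership split (type names are stand-ins):

```cpp
#include <memory>

struct GpuBondedLists // stand-in for gmx::GpuBonded
{
};

struct ForceRecord // stand-in for t_forcerec
{
    GpuBondedLists* gpuBonded = nullptr; // non-owning view
};

int main()
{
    // The runner holds unique ownership ...
    auto gpuBonded = std::make_unique<GpuBondedLists>();

    // ... and consumers see only a raw pointer whose lifetime is
    // bounded by the unique_ptr.
    ForceRecord fr;
    fr.gpuBonded = gpuBonded.get();

    // At teardown (mirroring the end of mdrunner()): drop the view,
    // then release the owned object before the GPU context goes away.
    fr.gpuBonded = nullptr;
    gpuBonded.reset();
}
```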
@@ -1368,12 +1379,13 @@ int Mdrunner::mdrunner()
         /* With periodic molecules the charge groups should be whole at start up
          * and the virtual sites should not be far from their proper positions.
          */
-        if (!inputrec->bContinuation && MASTER(cr) && !(inputrec->ePBC != epbcNONE && inputrec->bPeriodicMols))
+        if (!inputrec->bContinuation && MASTER(cr)
+            && !(inputrec->pbcType != PbcType::No && inputrec->bPeriodicMols))
         {
             /* Make molecules whole at start of run */
-            if (fr->ePBC != epbcNONE)
+            if (fr->pbcType != PbcType::No)
             {
-                do_pbc_first_mtop(fplog, inputrec->ePBC, box, &mtop, globalState->x.rvec_array());
+                do_pbc_first_mtop(fplog, inputrec->pbcType, box, &mtop, globalState->x.rvec_array());
             }
             if (vsite)
             {
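For context, do_pbc_first_mtop() makes each molecule whole across the periodic boundaries before the first step. Sketched below for the simplest case of a rectangular box and a chain-like molecule (GROMACS itself handles triclinic boxes and arbitrary bond graphs):

```cpp
#include <cmath>
#include <vector>

struct Vec3
{
    double x, y, z;
};

// Sketch: shift each atom by whole box vectors so it lies within half
// a box length of the previous atom along every axis (minimum image,
// rectangular box). Assumes consecutive atoms are bonded neighbours.
static void makeMoleculeWhole(std::vector<Vec3>& atoms, const Vec3& box)
{
    for (size_t i = 1; i < atoms.size(); ++i)
    {
        atoms[i].x -= box.x * std::round((atoms[i].x - atoms[i - 1].x) / box.x);
        atoms[i].y -= box.y * std::round((atoms[i].y - atoms[i - 1].y) / box.y);
        atoms[i].z -= box.z * std::round((atoms[i].z - atoms[i - 1].z) / box.z);
    }
}
```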
@@ -1526,7 +1538,7 @@ int Mdrunner::mdrunner()
             /* This call is not included in init_domain_decomposition mainly
              * because fr->cginfo_mb is set later.
              */
-            dd_init_bondeds(fplog, cr->dd, &mtop, vsite.get(), inputrec,
+            dd_init_bondeds(fplog, cr->dd, mtop, vsite.get(), inputrec,
                             domdecOptions.checkBondedInteractions, fr->cginfo_mb);
         }
 
@@ -1535,10 +1547,10 @@ int Mdrunner::mdrunner()
         // make it work.
         MdrunScheduleWorkload runScheduleWork;
         // Also populates the simulation constant workload description.
-        runScheduleWork.simulationWork = createSimulationWorkload(
-                useGpuForNonbonded, pmeRunMode, useGpuForBonded, useGpuForUpdate,
-                devFlags.enableGpuBufferOps, devFlags.enableGpuHaloExchange,
-                devFlags.enableGpuPmePPComm, haveEwaldSurfaceContribution(*inputrec));
+        runScheduleWork.simulationWork =
+                createSimulationWorkload(*inputrec, useGpuForNonbonded, pmeRunMode, useGpuForBonded,
+                                         useGpuForUpdate, devFlags.enableGpuBufferOps,
+                                         devFlags.enableGpuHaloExchange, devFlags.enableGpuPmePPComm);
 
         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
         if (gpusWereDetected
@@ -1619,18 +1631,46 @@ int Mdrunner::mdrunner()
     }
 
     // FIXME: this is only here to manually unpin mdAtoms->chargeA_ and state->x,
-    // before we destroy the GPU context(s) in free_gpu_resources().
+    // before we destroy the GPU context(s) in free_gpu().
     // Pinned buffers are associated with contexts in CUDA.
     // As soon as we destroy GPU contexts after mdrunner() exits, these lines should go.
     mdAtoms.reset(nullptr);
     globalState.reset(nullptr);
     mdModules_.reset(nullptr); // destruct force providers here as they might also use the GPU
+    gpuBonded.reset(nullptr);
+    /* Free pinned buffers in *fr */
+    delete fr;
+    fr = nullptr;
+
+    if (hwinfo->gpu_info.n_dev > 0)
+    {
+        /* stop the GPU profiler (only CUDA) */
+        stopGpuProfiler();
+    }
+
+    /* With tMPI we need to wait for all ranks to finish deallocation before
+     * destroying the CUDA context in free_gpu(), as some tMPI ranks may be
+     * sharing a GPU and its context.
+     *
+     * This is not a concern in OpenCL, where we use one context per rank,
+     * which is freed in nbnxn_gpu_free().
+     *
+     * Note: it is safe to skip the barrier on the ranks that do not use a GPU,
+     * but it is easier and more future-proof to call it on the whole node.
+     *
+     * Note that this barrier needs to be reached even when GPUs are not used
+     * in this run, because the PME ranks have no knowledge of whether GPUs
+     * are used or not, yet all ranks need to enter the barrier below.
+     * \todo Remove this physical node barrier once it is confirmed to be
+     * unnecessary (for a run with a shared GPU).
+     */
+    if (GMX_THREAD_MPI)
+    {
+        physicalNodeComm.barrier();
+    }
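The block above is the inlined replacement for free_gpu_resources(): with thread-MPI, several ranks can share one CUDA context, so every rank must finish freeing its buffers before any rank destroys the context. A generic sketch of that teardown ordering, using C++20 std::barrier in place of physicalNodeComm.barrier() (all names here are illustrative):

```cpp
#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main()
{
    constexpr int nRanks = 4;
    std::barrier  allFreed(nRanks);

    auto rankWork = [&allFreed](int rank) {
        std::printf("rank %d: freeing its GPU buffers\n", rank);
        // No rank proceeds until every rank has freed its buffers.
        allFreed.arrive_and_wait();
        if (rank == 0) // pretend rank 0 owns the shared context
        {
            std::printf("rank 0: destroying the shared GPU context\n");
        }
    };

    std::vector<std::thread> ranks;
    for (int r = 0; r < nRanks; ++r)
    {
        ranks.emplace_back(rankWork, r);
    }
    for (auto& t : ranks)
    {
        t.join();
    }
}
```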
 
-    /* Free GPU memory and set a physical node tMPI barrier (which should eventually go away) */
-    free_gpu_resources(fr, physicalNodeComm, hwinfo->gpu_info);
     free_gpu(nonbondedDeviceInfo);
     free_gpu(pmeDeviceInfo);
-    done_forcerec(fr, mtop.molblock.size());
     sfree(fcd);
 
     if (doMembed)