Merge "Merge branch 'release-2019' into master"
author Paul Bauer <paul.bauer.q@gmail.com>
Tue, 29 Oct 2019 13:40:41 +0000 (14:40 +0100)
committer Paul Bauer <paul.bauer.q@gmail.com>
Tue, 29 Oct 2019 13:40:41 +0000 (14:40 +0100)
admin/builds/gromacs.py
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/md.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/simulation_workload.h
src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/decidegpuusage.h
src/gromacs/taskassignment/decidesimulationworkload.cpp
src/gromacs/taskassignment/decidesimulationworkload.h

index 4fa518f1864865c2b4d442d5e6b9d8d9d9e87590..49007c5a63f125a098c719b7e73bf4b1b005d2e2 100644 (file)
@@ -185,7 +185,6 @@ def do_build(context):
 
     # GPU update flag enables GPU update+constraints as well as buffer ops (dependency)
     if context.opts.gpuupdate:
-        context.env.set_env_var('GMX_USE_GPU_BUFFER_OPS', "1")
         context.env.set_env_var('GMX_FORCE_UPDATE_DEFAULT_GPU', "1")
 
     regressiontests_path = context.workspace.get_project_dir(Project.REGRESSIONTESTS)
index f9fd797164b6bb6281909cccc4d494d23e39827c..5962a06bb771b313781c6789fa8f21e4ea5c0a1e 100644 (file)
@@ -777,14 +777,15 @@ setupForceOutputs(t_forcerec                          *fr,
 /*! \brief Set up flags that have the lifetime of the domain, indicating what type of work there is to compute.
  */
 static DomainLifetimeWorkload
-setupDomainLifetimeWorkload(const t_inputrec       &inputrec,
-                            const t_forcerec       &fr,
-                            const pull_t           *pull_work,
-                            const gmx_edsam        *ed,
-                            const t_idef           &idef,
-                            const t_fcdata         &fcd,
-                            const t_mdatoms        &mdatoms,
-                            const StepWorkload     &stepWork)
+setupDomainLifetimeWorkload(const t_inputrec         &inputrec,
+                            const t_forcerec         &fr,
+                            const pull_t             *pull_work,
+                            const gmx_edsam          *ed,
+                            const t_idef             &idef,
+                            const t_fcdata           &fcd,
+                            const t_mdatoms          &mdatoms,
+                            const SimulationWorkload &simulationWork,
+                            const StepWorkload       &stepWork)
 {
     DomainLifetimeWorkload domainWork;
     // Note that haveSpecialForces is constant over the whole run
@@ -795,6 +796,10 @@ setupDomainLifetimeWorkload(const t_inputrec       &inputrec,
     domainWork.haveCpuListedForceWork = haveCpuListedForces(fr, idef, fcd);
     // Note that haveFreeEnergyWork is constant over the whole run
     domainWork.haveFreeEnergyWork     = (fr.efep != efepNO && mdatoms.nPerturbed != 0);
+    // We assume we have local force work if there are CPU
+    // force tasks, including PME or nonbondeds.
+    domainWork.haveCpuLocalForceWork  = domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || domainWork.haveFreeEnergyWork ||
+        simulationWork.useCpuNonbonded || simulationWork.useCpuPme;
     return domainWork;
 }
 
@@ -832,8 +837,8 @@ setupStepWorkload(const int                 legacyFlags,
     // on virial steps the CPU reduction path is taken
     // TODO: remove flags.computeEnergy, ref #3128
     flags.useGpuFBufferOps    = simulationWork.useGpuBufferOps && !(flags.computeVirial || flags.computeEnergy);
-    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.usePmeGpu &&
-                                                           (rankHasPmeDuty || simulationWork.useGpuPmePPCommunication));
+    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.useGpuPme &&
+                                                           (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication));
 
     return flags;
 }
@@ -943,7 +948,7 @@ void do_force(FILE                                     *fplog,
     const StepWorkload &stepWork = runScheduleWork->stepWork;
 
 
-    const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
+    const bool useGpuPmeOnThisRank = simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME);
     const int  pmeFlags            = makePmeFlags(stepWork);
 
     // Switches on whether to use GPU for position and force buffer operations
@@ -1029,6 +1034,15 @@ void do_force(FILE                                     *fplog,
         }
     }
 
+    // Copy coordinates from the GPU when update runs on the GPU and there are forces to be computed on the CPU.
+    // At search steps the current coordinates are already on the host, so the copy is not needed.
+    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch &&
+        runScheduleWork->domainWork.haveCpuLocalForceWork)
+    {
+        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+    }
+
 #if GMX_MPI
     if (!thisRankHasDuty(cr, DUTY_PME))
     {
@@ -1037,12 +1051,12 @@ void do_force(FILE                                     *fplog,
          * and domain decomposition does not use the graph,
          * we do not need to worry about shifting.
          */
-        bool reinitGpuPmePpComms    = simulationWork.useGpuPmePPCommunication && (stepWork.doNeighborSearch);
-        bool sendCoordinatesFromGpu = simulationWork.useGpuPmePPCommunication && !(stepWork.doNeighborSearch);
+        bool reinitGpuPmePpComms    = simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
+        bool sendCoordinatesFromGpu = simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
         gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                  lambda[efptCOUL], lambda[efptVDW],
                                  (stepWork.computeVirial || stepWork.computeEnergy),
-                                 step, simulationWork.useGpuPmePPCommunication, reinitGpuPmePpComms,
+                                 step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
                                  sendCoordinatesFromGpu, wcycle);
     }
 #endif /* GMX_MPI */
@@ -1140,6 +1154,7 @@ void do_force(FILE                                     *fplog,
                                         top->idef,
                                         *fcd,
                                         *mdatoms,
+                                        simulationWork,
                                         stepWork);
 
         wallcycle_start_nocount(wcycle, ewcNS);
@@ -1573,12 +1588,6 @@ void do_force(FILE                                     *fplog,
         }
     }
 
-    // TODO move this into StepWorkload
-    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !stepWork.useGpuPmeFReduction;
-    // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
-    const bool haveCpuLocalForces     = (domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || useCpuPmeFReduction ||
-                                         (fr->efep != efepNO));
-
     if (havePPDomainDecomposition(cr))
     {
         /* We are done with the CPU compute.
@@ -1593,11 +1602,11 @@ void do_force(FILE                                     *fplog,
 
             if (useGpuForcesHaloExchange)
             {
-                if (haveCpuLocalForces)
+                if (domainWork.haveCpuLocalForceWork)
                 {
                     stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
                 }
-                gpuHaloExchange->communicateHaloForces(haveCpuLocalForces);
+                gpuHaloExchange->communicateHaloForces(domainWork.haveCpuLocalForceWork);
             }
             else
             {
@@ -1673,12 +1682,12 @@ void do_force(FILE                                     *fplog,
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with the default-path call below to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePPCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
-        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd, simulationWork.useGpuPmePPCommunication, stepWork.useGpuPmeFReduction, wcycle);
+        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd, simulationWork.useGpuPmePpCommunication, stepWork.useGpuPmeFReduction, wcycle);
     }
 
 
@@ -1715,7 +1724,7 @@ void do_force(FILE                                     *fplog,
             // local atoms. This depends on whether there are CPU-based force tasks
             // or when DD is active the halo exchange has resulted in contributions
             // from the non-local part.
-            const bool haveLocalForceContribInCpuBuffer = (haveCpuLocalForces || havePPDomainDecomposition(cr));
+            const bool haveLocalForceContribInCpuBuffer = (domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr));
 
             // TODO: move these steps as early as possible:
             // - CPU f H2D should be as soon as all CPU-side forces are done
@@ -1791,13 +1800,13 @@ void do_force(FILE                                     *fplog,
     }
 
     // TODO refactor this and unify with the PME-PP GPU communication path call above to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePPCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
         pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd,
-                               simulationWork.useGpuPmePPCommunication, false, wcycle);
+                               simulationWork.useGpuPmePpCommunication, false, wcycle);
     }
 
     if (stepWork.computeForces)
index fef7f4850a6af7824b983b2e0cbd804e4e6dbdc6..1002a58373489853039304a4ebd25adc0474d582 100644 (file)
@@ -63,6 +63,7 @@
 #include "gromacs/essentialdynamics/edsam.h"
 #include "gromacs/ewald/pme.h"
 #include "gromacs/ewald/pme_load_balancing.h"
+#include "gromacs/ewald/pme_pp_comm_gpu.h"
 #include "gromacs/fileio/trxio.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
@@ -331,7 +332,7 @@ void gmx::LegacySimulator::do_md()
 //       2. The proper GPU synchronization is introduced, so that the H2D and D2H data copies can be performed in the separate
 //          stream owned by the StatePropagatorDataGpu
     const auto &simulationWork     = runScheduleWork->simulationWork;
-    const bool  useGpuForPme       = simulationWork.usePmeGpu;
+    const bool  useGpuForPme       = simulationWork.useGpuPme;
     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
     // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
@@ -749,7 +750,7 @@ void gmx::LegacySimulator::do_md()
             // TODO: Move to after all booleans are defined.
             if (useGpuForUpdate && !bFirstStep)
             {
-                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
+                stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
             }
             /* PME grid + cut-off optimization with GPUs or PME nodes */
@@ -825,15 +826,12 @@ void gmx::LegacySimulator::do_md()
                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
             }
 
-            // Copy coordinate from the GPU when needed:
-            // - On search steps to keep copy on host (device buffers are reinitialized).
-            // - There are CPU bonded forces that need current coordinates
-            // - When needed for the output.
-            if (bNS ||
-                (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork) ||
-                do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed))
+            // Copy coordinates from the GPU when needed at search steps.
+            // NOTE: The cases where coordinates are needed on the CPU for force evaluation are handled in sim_util.cpp.
+            // NOTE: If the coordinates are to be written to an output file, they are also copied separately before the output.
+            if (bNS)
             {
-                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
+                stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
             }
         }
@@ -1164,6 +1162,14 @@ void gmx::LegacySimulator::do_md()
             }
         }
 
+        // Copy coordinates from the GPU for output if the update is offloaded and
+        // the coordinates have not already been copied for i) the search or ii) CPU force tasks.
+        if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork &&
+            (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)))
+        {
+            stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
+            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+        }
         /* Now we have the energies and forces corresponding to the
          * coordinates at time t. We must output all of this before
          * the update.
@@ -1284,7 +1290,7 @@ void gmx::LegacySimulator::do_md()
 
                 // Copy data to the GPU after buffers might have been reinitialized
                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
-                stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
+                stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
             }
 
             stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::All);
@@ -1308,8 +1314,6 @@ void gmx::LegacySimulator::do_md()
             {
                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
-                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
             }
         }
         else
@@ -1426,6 +1430,12 @@ void gmx::LegacySimulator::do_md()
 
             if (bGStat || needEkinAtNextStep || doInterSimSignal)
             {
+                // Copy coordinates when needed to stop the CM motion.
+                if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
+                {
+                    stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
+                    stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+                }
                 // Since we're already communicating at this step, we
                 // can propagate intra-simulation signals. Note that
                 // check_nstglobalcomm has the responsibility for
@@ -1462,7 +1472,7 @@ void gmx::LegacySimulator::do_md()
                     // TODO: The special case of removing CM motion should be dealt with more gracefully
                     if (useGpuForUpdate)
                     {
-                        stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
+                        stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
                     }
                 }
@@ -1714,6 +1724,12 @@ void gmx::LegacySimulator::do_md()
 
     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
 
+    if (fr->pmePpCommGpu)
+    {
+        // Destroy the object since it is no longer required. (This needs to be done while the GPU context still exists.)
+        fr->pmePpCommGpu.reset();
+    }
+
     global_stat_destroy(gstat);
 
 }
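
Taken together, the sim_util.cpp and md.cpp hunks above replace one catch-all
device-to-host coordinate copy with narrower, purpose-specific ones. Below is a
minimal standalone sketch of the resulting decision logic, using stand-in
structs and illustrative function names rather than the GROMACS headers:

#include <cstdio>

// Stand-in types; the real flags live in gromacs/mdtypes/simulation_workload.h.
struct SimulationWorkload     { bool useGpuUpdate          = false; };
struct DomainLifetimeWorkload { bool haveCpuLocalForceWork = false; };

// do_force(): with GPU update, copy coordinates to the host only when CPU
// force tasks need them; on search steps they are already on the host.
static bool copyCoordinatesForCpuForces(const SimulationWorkload     &sim,
                                        const DomainLifetimeWorkload &domain,
                                        bool                          doNeighborSearch)
{
    return sim.useGpuUpdate && !doNeighborSearch && domain.haveCpuLocalForceWork;
}

// do_md(): copy for output only if neither a search step nor the CPU-force
// path above has already brought the coordinates to the host.
static bool copyCoordinatesForOutput(const SimulationWorkload     &sim,
                                     const DomainLifetimeWorkload &domain,
                                     bool                          doNeighborSearch,
                                     bool                          writeCoordinatesThisStep)
{
    return sim.useGpuUpdate && !doNeighborSearch && !domain.haveCpuLocalForceWork
           && writeCoordinatesThisStep;
}

int main()
{
    const SimulationWorkload     sim{ true };     // update offloaded to the GPU
    const DomainLifetimeWorkload domain{ false }; // no CPU force tasks in this domain
    std::printf("copy for CPU forces: %d\n", copyCoordinatesForCpuForces(sim, domain, false));
    std::printf("copy for output:     %d\n", copyCoordinatesForOutput(sim, domain, false, true));
    return 0;
}
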
index 3b7a417d642cb25cc3ffa90b4fac4b3397ddb4cf..245d6f9e50e838de17edd51e33221e399b5caf93 100644 (file)
@@ -885,6 +885,27 @@ int Mdrunner::mdrunner()
     // and report those features that are enabled.
     const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog, useGpuForNonbonded, useGpuForPme);
 
+    // NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for the overrides,
+    //       while decideWhetherToUseGpuForUpdate() needs the devFlags for the '-update auto' override; hence the interleaving.
+    // NOTE: When the simulationWork is constructed, useGpuForUpdate overrides devFlags.enableGpuBufferOps.
+    try
+    {
+        useGpuForUpdate = decideWhetherToUseGpuForUpdate(devFlags.forceGpuUpdateDefaultOn,
+                                                         useDomainDecomposition,
+                                                         useGpuForPme,
+                                                         useGpuForNonbonded,
+                                                         updateTarget,
+                                                         gpusWereDetected,
+                                                         *inputrec,
+                                                         gmx_mtop_interaction_count(mtop, IF_VSITE) > 0,
+                                                         doEssentialDynamics,
+                                                         gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+                                                         gmx_mtop_ftype_count(mtop, F_DISRES) > 0,
+                                                         replExParams.exchangeInterval > 0);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+
     // Build restraints.
     // TODO: hide restraint implementation details from Mdrunner.
     // There is nothing unique about restraints at this point as far as the
@@ -1333,7 +1354,7 @@ int Mdrunner::mdrunner()
         // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
         if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
         {
-            GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps, "Must use GMX_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
+            GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps, "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
             void *streamLocal              = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
             void *streamNonLocal           = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
             void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
@@ -1545,30 +1566,6 @@ int Mdrunner::mdrunner()
                             fr->cginfo_mb);
         }
 
-        if (updateTarget == TaskTarget::Gpu)
-        {
-            if (SIMMASTER(cr))
-            {
-                gmx_fatal(FARGS, "It is currently not possible to redirect the calculation "
-                          "of update and constraints to the GPU!");
-            }
-        }
-
-        // Before we start the actual simulator, try if we can run the update task on the GPU.
-        useGpuForUpdate = decideWhetherToUseGpuForUpdate(devFlags.forceGpuUpdateDefaultOn,
-                                                         DOMAINDECOMP(cr),
-                                                         useGpuForPme,
-                                                         useGpuForNonbonded,
-                                                         devFlags.enableGpuBufferOps,
-                                                         updateTarget,
-                                                         gpusWereDetected,
-                                                         *inputrec,
-                                                         mdAtoms->mdatoms()->haveVsites,
-                                                         doEssentialDynamics,
-                                                         gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
-                                                         gmx_mtop_ftype_count(mtop, F_DISRES) > 0,
-                                                         replExParams.exchangeInterval > 0);
-
         const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
                     false,
                     inputrec, doRerun, vsite.get(), ms, replExParams,
@@ -1577,8 +1574,21 @@ int Mdrunner::mdrunner()
 
         const bool useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
 
+        // TODO This is not the right place to manage the lifetime of
+        // this data structure, but currently it's the easiest way to
+        // make it work.
+        MdrunScheduleWorkload runScheduleWork;
+        // Also populates the simulation constant workload description.
+        runScheduleWork.simulationWork = createSimulationWorkload(useGpuForNonbonded,
+                                                                  pmeRunMode,
+                                                                  useGpuForBonded,
+                                                                  useGpuForUpdate,
+                                                                  devFlags.enableGpuBufferOps,
+                                                                  devFlags.enableGpuHaloExchange,
+                                                                  devFlags.enableGpuPmePPComm);
+
         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
-        if (gpusWereDetected && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || devFlags.enableGpuBufferOps))
+        if (gpusWereDetected && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || runScheduleWork.simulationWork.useGpuBufferOps))
         {
             const void         *pmeStream      = pme_gpu_get_device_stream(fr->pmedata);
             const void         *localStream    = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local) : nullptr;
@@ -1596,21 +1606,6 @@ int Mdrunner::mdrunner()
             fr->stateGpu = stateGpu.get();
         }
 
-        // TODO This is not the right place to manage the lifetime of
-        // this data structure, but currently it's the easiest way to
-        // make it work.
-        MdrunScheduleWorkload runScheduleWork;
-        // Also populates the simulation constant workload description.
-        runScheduleWork.simulationWork = createSimulationWorkload(useGpuForNonbonded,
-                                                                  useGpuForPme,
-                                                                  (pmeRunMode == PmeRunMode::GPU),
-                                                                  useGpuForBonded,
-                                                                  useGpuForUpdate,
-                                                                  devFlags.enableGpuBufferOps,
-                                                                  devFlags.enableGpuHaloExchange,
-                                                                  devFlags.enableGpuPmePPComm);
-
-
         GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
         SimulatorBuilder simulatorBuilder;
 
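
The runner.cpp reordering above is required because the decisions feed into one
another: the GPU-update decision must exist before the simulation workload is
built (where it overrides the buffer-ops development flag), and the workload in
turn gates the construction of StatePropagatorDataGpu. A minimal sketch of that
dependency chain, with stand-in types rather than the GROMACS API:

#include <cassert>

struct DevFlags           { bool enableGpuBufferOps = false; };
struct SimulationWorkload { bool useGpuBufferOps    = false; };

static SimulationWorkload buildWorkload(const DevFlags &devFlags, bool useGpuForUpdate)
{
    SimulationWorkload work;
    // GPU update requires GPU buffer ops, so the update decision overrides
    // the development flag (mirrors createSimulationWorkload()).
    work.useGpuBufferOps = devFlags.enableGpuBufferOps || useGpuForUpdate;
    return work;
}

int main()
{
    const DevFlags devFlags{};         // GMX_USE_GPU_BUFFER_OPS not set
    const bool useGpuForUpdate = true; // decided first, before the workload is built
    const SimulationWorkload work = buildWorkload(devFlags, useGpuForUpdate);
    assert(work.useGpuBufferOps);      // buffer ops forced on by the GPU update path
    return 0;
}
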
index 147199771cb7ed5d5ff396b7e90fb4c945a9d2c6..6594081c509036399646ee972944d7f6278be54d 100644 (file)
@@ -116,6 +116,8 @@ class DomainLifetimeWorkload
         bool haveCpuListedForceWork = false;
         //! Whether the current nstlist step-range has special forces on the CPU.
         bool haveSpecialForces = false;
+        //! Whether there are currently any local forces to be computed on the CPU
+        bool haveCpuLocalForceWork = false;
 
         // TODO
         //! Whether the current nstlist step-range has free energy work on the CPU.
@@ -134,12 +136,16 @@ class DomainLifetimeWorkload
 class SimulationWorkload
 {
     public:
+        //! If we have calculation of short range nonbondeds on CPU
+        bool useCpuNonbonded           = false;
         //! If we have calculation of short range nonbondeds on GPU
         bool useGpuNonbonded           = false;
-        //! If we have calculation of long range PME in GPU
-        bool usePmeGpu                 = false;
+        //! If we have calculation of long range PME on CPU
+        bool useCpuPme                 = false;
+        //! If we have calculation of long range PME on GPU
+        bool useGpuPme                 = false;
         //! If PME FFT solving is done on GPU.
-        bool usePmeFftGpu              = false;
+        bool useGpuPmeFft              = false;
         //! If bonded interactions are calculated on GPU.
         bool useGpuBonded              = false;
         //! If update and constraint solving is performed on GPU.
@@ -149,7 +155,7 @@ class SimulationWorkload
         //! If domain decomposition halo exchange is performed on GPU.
         bool useGpuHaloExchange        = false;
         //! If direct PP-PME communication between GPUs is used.
-        bool useGpuPmePPCommunication  = false;
+        bool useGpuPmePpCommunication  = false;
         //! If direct GPU-GPU communication is enabled.
         bool useGpuDirectCommunication = false;
 };
index 71ca58d1584f1f5647bc5b584a0fcf0190337b81..f40814e23a2085be8e551d5283ca6008a3ab5862 100644 (file)
@@ -494,7 +494,6 @@ bool decideWhetherToUseGpuForUpdate(const bool        forceGpuUpdateDefaultOn,
                                     const bool        isDomainDecomposition,
                                     const bool        useGpuForPme,
                                     const bool        useGpuForNonbonded,
-                                    const bool        useGpuForBufferOps,
                                     const TaskTarget  updateTarget,
                                     const bool        gpusWereDetected,
                                     const t_inputrec &inputrec,
@@ -516,10 +515,10 @@ bool decideWhetherToUseGpuForUpdate(const bool        forceGpuUpdateDefaultOn,
     {
         errorMessage += "Domain decomposition is not supported.\n";
     }
-    // Using the GPU-version of update makes sense if forces are already on the GPU, i.e. if at least:
-    // 1. PME is on the GPU (there should be a copy of coordinates on a GPU in rvec format for PME spread).
-    // 2. Non-bonded interactions and buffer ops are on the GPU.
-    if (!(useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps)))
+    // Using the GPU version of update makes sense if:
+    // 1. PME is on the GPU (there should be a copy of the coordinates on the GPU for PME spread), or
+    // 2. Non-bonded interactions are on the GPU.
+    if (!(useGpuForPme || useGpuForNonbonded))
     {
         errorMessage += "Either PME or short-ranged non-bonded interaction tasks must run on the GPU.\n";
     }
index c74ca0a197ce96071b0a9001a8b7152e0fa4f39c..6b8685fa3e1f886e029915a959e6143d798840c7 100644 (file)
@@ -235,7 +235,6 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
  * \param[in]  isDomainDecomposition     Whether there is more than one domain.
  * \param[in]  useGpuForPme              Whether GPUs will be used for PME interactions.
  * \param[in]  useGpuForNonbonded        Whether GPUs will be used for nonbonded interactions.
- * \param[in]  useGpuForBufferOps        Whether GPUs will be used for buffer operations.
  * \param[in]  updateTarget              User choice for running simulation on GPU.
  * \param[in]  gpusWereDetected          Whether compatible GPUs were detected on any node.
  * \param[in]  inputrec                  The user input.
@@ -253,7 +252,6 @@ bool decideWhetherToUseGpuForUpdate(bool              forceGpuUpdateDefaultOn,
                                     bool              isDomainDecomposition,
                                     bool              useGpuForPme,
                                     bool              useGpuForNonbonded,
-                                    bool              useGpuForBufferOps,
                                     TaskTarget        updateTarget,
                                     bool              gpusWereDetected,
                                     const t_inputrec &inputrec,
index 17c3dc06af349f9a5620d797952335ba15cb712a..ea82f95f42bccd1eef0cef622a3d343a0204ec28 100644 (file)
 
 #include "decidesimulationworkload.h"
 
+#include "gromacs/ewald/pme.h"
 #include "gromacs/taskassignment/taskassignment.h"
 #include "gromacs/utility/arrayref.h"
 
 namespace gmx
 {
 
-SimulationWorkload createSimulationWorkload(bool useGpuForNonbonded,
-                                            bool useGpuForPme,
-                                            bool useGpuForPmeFft,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdateConstraints,
-                                            bool useGpuForBufferOps,
-                                            bool useGpuHaloExchange,
-                                            bool useGpuPmePpComm)
+SimulationWorkload createSimulationWorkload(bool       useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool       useGpuForBonded,
+                                            bool       useGpuForUpdate,
+                                            bool       useGpuForBufferOps,
+                                            bool       useGpuHaloExchange,
+                                            bool       useGpuPmePpComm)
 {
-    SimulationWorkload simulationWorkload {
-        useGpuForNonbonded,
-        useGpuForPme,
-        useGpuForPmeFft,
-        useGpuForBonded,
-        useGpuForUpdateConstraints,
-        useGpuForBufferOps,
-        useGpuHaloExchange,
-        useGpuPmePpComm,
-        useGpuHaloExchange || useGpuPmePpComm
-    };
+    SimulationWorkload simulationWorkload;
+    simulationWorkload.useCpuNonbonded           = !useGpuForNonbonded;
+    simulationWorkload.useGpuNonbonded           = useGpuForNonbonded;
+    simulationWorkload.useCpuPme                 = (pmeRunMode == PmeRunMode::CPU);
+    simulationWorkload.useGpuPme                 = (pmeRunMode == PmeRunMode::GPU || pmeRunMode == PmeRunMode::Mixed);
+    simulationWorkload.useGpuPmeFft              = (pmeRunMode == PmeRunMode::Mixed);
+    simulationWorkload.useGpuBonded              = useGpuForBonded;
+    simulationWorkload.useGpuUpdate              = useGpuForUpdate;
+    simulationWorkload.useGpuBufferOps           = useGpuForBufferOps || useGpuForUpdate;
+    simulationWorkload.useGpuHaloExchange        = useGpuHaloExchange;
+    simulationWorkload.useGpuPmePpCommunication  = useGpuPmePpComm;
+    simulationWorkload.useGpuDirectCommunication = useGpuHaloExchange || useGpuPmePpComm;
 
     return simulationWorkload;
 }
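
For reference, a standalone sketch of the PmeRunMode-to-flags mapping that
createSimulationWorkload() now performs, mirroring the assignments in the hunk
above (the enum and struct below are stand-ins, not the GROMACS headers):

#include <cassert>

enum class PmeRunMode { None, CPU, GPU, Mixed };

struct PmeWorkFlags { bool useCpuPme = false, useGpuPme = false, useGpuPmeFft = false; };

static PmeWorkFlags pmeFlagsFromRunMode(PmeRunMode pmeRunMode)
{
    PmeWorkFlags flags;
    flags.useCpuPme    = (pmeRunMode == PmeRunMode::CPU);
    flags.useGpuPme    = (pmeRunMode == PmeRunMode::GPU || pmeRunMode == PmeRunMode::Mixed);
    flags.useGpuPmeFft = (pmeRunMode == PmeRunMode::Mixed);
    return flags;
}

int main()
{
    // A single PmeRunMode value replaces the pair of booleans that the callers
    // previously had to keep mutually consistent by hand.
    assert(pmeFlagsFromRunMode(PmeRunMode::GPU).useGpuPme);
    assert(pmeFlagsFromRunMode(PmeRunMode::Mixed).useGpuPme);
    assert(pmeFlagsFromRunMode(PmeRunMode::Mixed).useGpuPmeFft);
    assert(pmeFlagsFromRunMode(PmeRunMode::CPU).useCpuPme);
    return 0;
}
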
index 664a307b93c632fccd0de1f0e1a5701a1be7adef..b389da53c168a36d98396f004d532f196f800a17 100644 (file)
@@ -46,6 +46,8 @@
 
 #include "gromacs/mdtypes/simulation_workload.h"
 
+enum class PmeRunMode;
+
 namespace gmx
 {
 
@@ -55,24 +57,22 @@ namespace gmx
  *
  * \param[in] useGpuForNonbonded If we have short-range nonbonded interaction
  *                               calculations on GPU(s).
- * \param[in] useGpuForPme       If long range PME interactions are calculated on GPU(s).
- * \param[in] useGpuForPmeFft    If FFT solving for PME is done on the GPU.
+ * \param[in] pmeRunMode         Run mode indicating what resource PME is executed on.
  * \param[in] useGpuForBonded    If bonded interactions are calculated on GPU(s).
- * \param[in] useGpuForUpdateConstraints If coordinate update and constraint solving is performed on
- *                                       GPU(s).
+ * \param[in] useGpuForUpdate    If coordinate update and constraint solving is performed on
+ *                               GPU(s).
  * \param[in] useGpuForBufferOps If buffer ops / reduction are calculated on GPU(s).
  * \param[in] useGpuHaloExchange If GPU direct communication is used in halo exchange.
  * \param[in] useGpuPmePpComm    If GPU direct communication is used in PME-PP communication.
  * \returns Simulation lifetime constant workload description.
  */
-SimulationWorkload createSimulationWorkload(bool useGpuForNonbonded,
-                                            bool useGpuForPme,
-                                            bool useGpuForPmeFft,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdateConstraints,
-                                            bool useGpuForBufferOps,
-                                            bool useGpuHaloExchange,
-                                            bool useGpuPmePpComm);
+SimulationWorkload createSimulationWorkload(bool       useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool       useGpuForBonded,
+                                            bool       useGpuForUpdate,
+                                            bool       useGpuForBufferOps,
+                                            bool       useGpuHaloExchange,
+                                            bool       useGpuPmePpComm);
 
 
 }  // namespace gmx