Merge "Merge branch 'release-2019' into master"
author Paul Bauer <paul.bauer.q@gmail.com>
Tue, 29 Oct 2019 13:40:41 +0000 (14:40 +0100)
committer Paul Bauer <paul.bauer.q@gmail.com>
Tue, 29 Oct 2019 13:40:41 +0000 (14:40 +0100)
admin/builds/gromacs.py
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/md.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/simulation_workload.h
src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/decidegpuusage.h
src/gromacs/taskassignment/decidesimulationworkload.cpp
src/gromacs/taskassignment/decidesimulationworkload.h

index 4fa518f1864865c2b4d442d5e6b9d8d9d9e87590..49007c5a63f125a098c719b7e73bf4b1b005d2e2 100644 (file)
@@ -185,7 +185,6 @@ def do_build(context):
 
     # GPU update flag enables GPU update+constraints as well as buffer ops (dependency)
     if context.opts.gpuupdate:
-        context.env.set_env_var('GMX_USE_GPU_BUFFER_OPS', "1")
         context.env.set_env_var('GMX_FORCE_UPDATE_DEFAULT_GPU', "1")
 
     regressiontests_path = context.workspace.get_project_dir(Project.REGRESSIONTESTS)
index f9fd797164b6bb6281909cccc4d494d23e39827c..5962a06bb771b313781c6789fa8f21e4ea5c0a1e 100644 (file)
@@ -777,14 +777,15 @@ setupForceOutputs(t_forcerec                          *fr,
 /*! \brief Set up flags that have the lifetime of the domain, indicating what type of work there is to compute.
  */
 static DomainLifetimeWorkload
-setupDomainLifetimeWorkload(const t_inputrec       &inputrec,
-                            const t_forcerec       &fr,
-                            const pull_t           *pull_work,
-                            const gmx_edsam        *ed,
-                            const t_idef           &idef,
-                            const t_fcdata         &fcd,
-                            const t_mdatoms        &mdatoms,
-                            const StepWorkload     &stepWork)
+setupDomainLifetimeWorkload(const t_inputrec         &inputrec,
+                            const t_forcerec         &fr,
+                            const pull_t             *pull_work,
+                            const gmx_edsam          *ed,
+                            const t_idef             &idef,
+                            const t_fcdata           &fcd,
+                            const t_mdatoms          &mdatoms,
+                            const SimulationWorkload &simulationWork,
+                            const StepWorkload       &stepWork)
 {
     DomainLifetimeWorkload domainWork;
     // Note that haveSpecialForces is constant over the whole run
@@ -795,6 +796,10 @@ setupDomainLifetimeWorkload(const t_inputrec       &inputrec,
     domainWork.haveCpuListedForceWork = haveCpuListedForces(fr, idef, fcd);
     // Note that haveFreeEnergyWork is constant over the whole run
     domainWork.haveFreeEnergyWork     = (fr.efep != efepNO && mdatoms.nPerturbed != 0);
+    // We assume we have local force work if there are CPU
+    // force tasks, including PME or nonbondeds.
+    domainWork.haveCpuLocalForceWork  = domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || domainWork.haveFreeEnergyWork ||
+        simulationWork.useCpuNonbonded || simulationWork.useCpuPme;
     return domainWork;
 }
 
@@ -832,8 +837,8 @@ setupStepWorkload(const int                 legacyFlags,
     // on virial steps the CPU reduction path is taken
     // TODO: remove flags.computeEnergy, ref #3128
     flags.useGpuFBufferOps    = simulationWork.useGpuBufferOps && !(flags.computeVirial || flags.computeEnergy);
-    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.usePmeGpu &&
-                                                           (rankHasPmeDuty || simulationWork.useGpuPmePPCommunication));
+    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.useGpuPme &&
+                                                           (rankHasPmeDuty || simulationWork.useGpuPmePpCommunication));
 
     return flags;
 }
@@ -943,7 +948,7 @@ void do_force(FILE                                     *fplog,
     const StepWorkload &stepWork = runScheduleWork->stepWork;
 
 
-    const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
+    const bool useGpuPmeOnThisRank = simulationWork.useGpuPme && thisRankHasDuty(cr, DUTY_PME);
     const int  pmeFlags            = makePmeFlags(stepWork);
 
     // Switches on whether to use GPU for position and force buffer operations
@@ -1029,6 +1034,15 @@ void do_force(FILE                                     *fplog,
         }
     }
 
+    // Copy coordinates from the GPU when update runs on the GPU and there are forces to be computed on the CPU.
+    // At search steps the current coordinates are already on the host, so the copy is not needed.
+    if (simulationWork.useGpuUpdate && !stepWork.doNeighborSearch &&
+        runScheduleWork->domainWork.haveCpuLocalForceWork)
+    {
+        stateGpu->copyCoordinatesFromGpu(x.unpaddedArrayRef(), AtomLocality::Local);
+        stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+    }
+
 #if GMX_MPI
     if (!thisRankHasDuty(cr, DUTY_PME))
     {
@@ -1037,12 +1051,12 @@ void do_force(FILE                                     *fplog,
          * and domain decomposition does not use the graph,
          * we do not need to worry about shifting.
          */
-        bool reinitGpuPmePpComms    = simulationWork.useGpuPmePPCommunication && (stepWork.doNeighborSearch);
-        bool sendCoordinatesFromGpu = simulationWork.useGpuPmePPCommunication && !(stepWork.doNeighborSearch);
+        bool reinitGpuPmePpComms    = simulationWork.useGpuPmePpCommunication && (stepWork.doNeighborSearch);
+        bool sendCoordinatesFromGpu = simulationWork.useGpuPmePpCommunication && !(stepWork.doNeighborSearch);
         gmx_pme_send_coordinates(fr, cr, box, as_rvec_array(x.unpaddedArrayRef().data()),
                                  lambda[efptCOUL], lambda[efptVDW],
                                  (stepWork.computeVirial || stepWork.computeEnergy),
-                                 step, simulationWork.useGpuPmePPCommunication, reinitGpuPmePpComms,
+                                 step, simulationWork.useGpuPmePpCommunication, reinitGpuPmePpComms,
                                  sendCoordinatesFromGpu, wcycle);
     }
 #endif /* GMX_MPI */
@@ -1140,6 +1154,7 @@ void do_force(FILE                                     *fplog,
                                         top->idef,
                                         *fcd,
                                         *mdatoms,
+                                        simulationWork,
                                         stepWork);
 
         wallcycle_start_nocount(wcycle, ewcNS);
@@ -1573,12 +1588,6 @@ void do_force(FILE                                     *fplog,
         }
     }
 
-    // TODO move this into StepWorkload
-    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !stepWork.useGpuPmeFReduction;
-    // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
-    const bool haveCpuLocalForces     = (domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || useCpuPmeFReduction ||
-                                         (fr->efep != efepNO));
-
     if (havePPDomainDecomposition(cr))
     {
         /* We are done with the CPU compute.
@@ -1593,11 +1602,11 @@ void do_force(FILE                                     *fplog,
 
             if (useGpuForcesHaloExchange)
             {
-                if (haveCpuLocalForces)
+                if (domainWork.haveCpuLocalForceWork)
                 {
                     stateGpu->copyForcesToGpu(forceOut.forceWithShiftForces().force(), AtomLocality::Local);
                 }
-                gpuHaloExchange->communicateHaloForces(haveCpuLocalForces);
+                gpuHaloExchange->communicateHaloForces(domainWork.haveCpuLocalForceWork);
             }
             else
             {
@@ -1673,12 +1682,12 @@ void do_force(FILE                                     *fplog,
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
     // TODO refactor this and unify with the default-path call below to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePPCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && simulationWork.useGpuPmePpCommunication)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
-        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd, simulationWork.useGpuPmePPCommunication, stepWork.useGpuPmeFReduction, wcycle);
+        pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd, simulationWork.useGpuPmePpCommunication, stepWork.useGpuPmeFReduction, wcycle);
     }
 
 
@@ -1715,7 +1724,7 @@ void do_force(FILE                                     *fplog,
             // local atoms. This depends on whether there are CPU-based force tasks
             // or when DD is active the halo exchange has resulted in contributions
             // from the non-local part.
-            const bool haveLocalForceContribInCpuBuffer = (haveCpuLocalForces || havePPDomainDecomposition(cr));
+            const bool haveLocalForceContribInCpuBuffer = (domainWork.haveCpuLocalForceWork || havePPDomainDecomposition(cr));
 
             // TODO: move these steps as early as possible:
             // - CPU f H2D should be as soon as all CPU-side forces are done
@@ -1791,13 +1800,13 @@ void do_force(FILE                                     *fplog,
     }
 
     // TODO refactor this and unify with the PME-PP GPU communication path call above to the same function
-    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePPCommunication)
+    if (PAR(cr) && !thisRankHasDuty(cr, DUTY_PME) && !simulationWork.useGpuPmePpCommunication)
     {
         /* In case of node-splitting, the PP nodes receive the long-range
          * forces, virial and energy from the PME nodes here.
          */
         pme_receive_force_ener(fr, cr, &forceOut.forceWithVirial(), enerd,
-                               simulationWork.useGpuPmePPCommunication, false, wcycle);
+                               simulationWork.useGpuPmePpCommunication, false, wcycle);
     }
 
     if (stepWork.computeForces)
index fef7f4850a6af7824b983b2e0cbd804e4e6dbdc6..1002a58373489853039304a4ebd25adc0474d582 100644 (file)
@@ -63,6 +63,7 @@
 #include "gromacs/essentialdynamics/edsam.h"
 #include "gromacs/ewald/pme.h"
 #include "gromacs/ewald/pme_load_balancing.h"
+#include "gromacs/ewald/pme_pp_comm_gpu.h"
 #include "gromacs/fileio/trxio.h"
 #include "gromacs/gmxlib/network.h"
 #include "gromacs/gmxlib/nrnb.h"
@@ -331,7 +332,7 @@ void gmx::LegacySimulator::do_md()
 //       2. The proper GPU synchronization is introduced, so that the H2D and D2H data copies can be performed in the separate
 //          stream owned by the StatePropagatorDataGpu
     const auto &simulationWork     = runScheduleWork->simulationWork;
-    const bool  useGpuForPme       = simulationWork.usePmeGpu;
+    const bool  useGpuForPme       = simulationWork.useGpuPme;
     const bool  useGpuForNonbonded = simulationWork.useGpuNonbonded;
     // Temporary solution to make sure that the buffer ops are offloaded when update is offloaded
     const bool  useGpuForBufferOps = simulationWork.useGpuBufferOps;
@@ -749,7 +750,7 @@ void gmx::LegacySimulator::do_md()
             // TODO: Move to after all booleans are defined.
             if (useGpuForUpdate && !bFirstStep)
             {
-                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
+                stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
             }
             /* PME grid + cut-off optimization with GPUs or PME nodes */
@@ -825,15 +826,12 @@ void gmx::LegacySimulator::do_md()
                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
             }
 
-            // Copy coordinate from the GPU when needed:
-            // - On search steps to keep copy on host (device buffers are reinitialized).
-            // - There are CPU bonded forces that need current coordinates
-            // - When needed for the output.
-            if (bNS ||
-                (runScheduleWork->domainWork.haveCpuBondedWork || runScheduleWork->domainWork.haveFreeEnergyWork) ||
-                do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed))
+            // Copy coordinates from the GPU when needed at search steps.
+            // NOTE: The cases where coordinates are needed on the CPU for force evaluation are handled in sim_util.cpp.
+            // NOTE: If the coordinates are to be written to an output file, they are also copied separately before the output.
+            if (bNS)
             {
-                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
+                stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
                 stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
             }
         }
@@ -1164,6 +1162,14 @@ void gmx::LegacySimulator::do_md()
             }
         }
 
+        // Copy coordinates from the GPU for output if the update is offloaded and
+        // the coordinates have not already been copied for i) the search or ii) CPU force tasks.
+        if (useGpuForUpdate && !bNS && !runScheduleWork->domainWork.haveCpuLocalForceWork &&
+            (do_per_step(step, ir->nstxout) || do_per_step(step, ir->nstxout_compressed)))
+        {
+            stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
+            stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+        }
         /* Now we have the energies and forces corresponding to the
          * coordinates at time t. We must output all of this before
          * the update.
@@ -1284,7 +1290,7 @@ void gmx::LegacySimulator::do_md()
 
                 // Copy data to the GPU after buffers might have been reinitialized
                 stateGpu->copyVelocitiesToGpu(state->v, AtomLocality::Local);
-                stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
+                stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
             }
 
             stateGpu->copyForcesToGpu(ArrayRef<RVec>(f), AtomLocality::All);
@@ -1308,8 +1314,6 @@ void gmx::LegacySimulator::do_md()
             {
                 stateGpu->copyVelocitiesFromGpu(state->v, AtomLocality::Local);
                 stateGpu->waitVelocitiesReadyOnHost(AtomLocality::Local);
-                stateGpu->copyCoordinatesFromGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
-                stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
             }
         }
         else
@@ -1426,6 +1430,12 @@ void gmx::LegacySimulator::do_md()
 
             if (bGStat || needEkinAtNextStep || doInterSimSignal)
             {
+                // Copy coordinates when needed to stop the CM motion.
+                if (useGpuForUpdate && !EI_VV(ir->eI) && bStopCM)
+                {
+                    stateGpu->copyCoordinatesFromGpu(state->x, AtomLocality::Local);
+                    stateGpu->waitCoordinatesReadyOnHost(AtomLocality::Local);
+                }
                 // Since we're already communicating at this step, we
                 // can propagate intra-simulation signals. Note that
                 // check_nstglobalcomm has the responsibility for
@@ -1462,7 +1472,7 @@ void gmx::LegacySimulator::do_md()
                     // TODO: The special case of removing CM motion should be dealt with more gracefully
                     if (useGpuForUpdate)
                     {
-                        stateGpu->copyCoordinatesToGpu(ArrayRef<RVec>(state->x), AtomLocality::Local);
+                        stateGpu->copyCoordinatesToGpu(state->x, AtomLocality::Local);
                         stateGpu->waitCoordinatesCopiedToDevice(AtomLocality::Local);
                     }
                 }
@@ -1714,6 +1724,12 @@ void gmx::LegacySimulator::do_md()
 
     walltime_accounting_set_nsteps_done(walltime_accounting, step_rel);
 
+    if (fr->pmePpCommGpu)
+    {
+        // Destroy the object since it is no longer required. (This needs to be done while the GPU context still exists.)
+        fr->pmePpCommGpu.reset();
+    }
+
     global_stat_destroy(gstat);
 
 }
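
Taken together, the sim_util.cpp and md.cpp hunks above replace one catch-all
device-to-host coordinate copy with narrower, purpose-specific ones. Below is a
minimal standalone sketch of the resulting decision logic, using stand-in
structs and illustrative function names rather than the GROMACS headers:

#include <cstdio>

// Stand-in types; the real flags live in gromacs/mdtypes/simulation_workload.h.
struct SimulationWorkload     { bool useGpuUpdate          = false; };
struct DomainLifetimeWorkload { bool haveCpuLocalForceWork = false; };

// do_force(): with GPU update, copy coordinates to the host only when CPU
// force tasks need them; on search steps they are already on the host.
static bool copyCoordinatesForCpuForces(const SimulationWorkload     &sim,
                                        const DomainLifetimeWorkload &domain,
                                        bool                          doNeighborSearch)
{
    return sim.useGpuUpdate && !doNeighborSearch && domain.haveCpuLocalForceWork;
}

// do_md(): copy for output only if neither a search step nor the CPU-force
// path above has already brought the coordinates to the host.
static bool copyCoordinatesForOutput(const SimulationWorkload     &sim,
                                     const DomainLifetimeWorkload &domain,
                                     bool                          doNeighborSearch,
                                     bool                          writeCoordinatesThisStep)
{
    return sim.useGpuUpdate && !doNeighborSearch && !domain.haveCpuLocalForceWork
           && writeCoordinatesThisStep;
}

int main()
{
    const SimulationWorkload     sim{ true };     // update offloaded to the GPU
    const DomainLifetimeWorkload domain{ false }; // no CPU force tasks in this domain
    std::printf("copy for CPU forces: %d\n", copyCoordinatesForCpuForces(sim, domain, false));
    std::printf("copy for output:     %d\n", copyCoordinatesForOutput(sim, domain, false, true));
    return 0;
}
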
index 3b7a417d642cb25cc3ffa90b4fac4b3397ddb4cf..245d6f9e50e838de17edd51e33221e399b5caf93 100644 (file)
@@ -885,6 +885,27 @@ int Mdrunner::mdrunner()
     // and report those features that are enabled.
     const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog, useGpuForNonbonded, useGpuForPme);
 
+    // NOTE: The devFlags need decideWhetherToUseGpusForNonbonded(...) and decideWhetherToUseGpusForPme(...) for the overrides,
+    //       while decideWhetherToUseGpuForUpdate() needs the devFlags for the '-update auto' override; hence the interleaving.
+    // NOTE: When the simulationWork is constructed, useGpuForUpdate overrides devFlags.enableGpuBufferOps.
+    try
+    {
+        useGpuForUpdate = decideWhetherToUseGpuForUpdate(devFlags.forceGpuUpdateDefaultOn,
+                                                         useDomainDecomposition,
+                                                         useGpuForPme,
+                                                         useGpuForNonbonded,
+                                                         updateTarget,
+                                                         gpusWereDetected,
+                                                         *inputrec,
+                                                         gmx_mtop_interaction_count(mtop, IF_VSITE) > 0,
+                                                         doEssentialDynamics,
+                                                         gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+                                                         gmx_mtop_ftype_count(mtop, F_DISRES) > 0,
+                                                         replExParams.exchangeInterval > 0);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
+
+
     // Build restraints.
     // TODO: hide restraint implementation details from Mdrunner.
     // There is nothing unique about restraints at this point as far as the
@@ -1333,7 +1354,7 @@ int Mdrunner::mdrunner()
         // TODO remove need to pass local stream into GPU halo exchange - Redmine #3093
         if (havePPDomainDecomposition(cr) && prefer1DAnd1PulseDD && is1DAnd1PulseDD(*cr->dd))
         {
-            GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps, "Must use GMX_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
+            GMX_RELEASE_ASSERT(devFlags.enableGpuBufferOps, "Must use GMX_USE_GPU_BUFFER_OPS=1 to use GMX_GPU_DD_COMMS=1");
             void *streamLocal              = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local);
             void *streamNonLocal           = Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::NonLocal);
             void *coordinatesOnDeviceEvent = fr->nbv->get_x_on_device_event();
@@ -1545,30 +1566,6 @@ int Mdrunner::mdrunner()
                             fr->cginfo_mb);
         }
 
-        if (updateTarget == TaskTarget::Gpu)
-        {
-            if (SIMMASTER(cr))
-            {
-                gmx_fatal(FARGS, "It is currently not possible to redirect the calculation "
-                          "of update and constraints to the GPU!");
-            }
-        }
-
-        // Before we start the actual simulator, try if we can run the update task on the GPU.
-        useGpuForUpdate = decideWhetherToUseGpuForUpdate(devFlags.forceGpuUpdateDefaultOn,
-                                                         DOMAINDECOMP(cr),
-                                                         useGpuForPme,
-                                                         useGpuForNonbonded,
-                                                         devFlags.enableGpuBufferOps,
-                                                         updateTarget,
-                                                         gpusWereDetected,
-                                                         *inputrec,
-                                                         mdAtoms->mdatoms()->haveVsites,
-                                                         doEssentialDynamics,
-                                                         gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
-                                                         gmx_mtop_ftype_count(mtop, F_DISRES) > 0,
-                                                         replExParams.exchangeInterval > 0);
-
         const bool inputIsCompatibleWithModularSimulator = ModularSimulator::isInputCompatible(
                     false,
                     inputrec, doRerun, vsite.get(), ms, replExParams,
@@ -1577,8 +1574,21 @@ int Mdrunner::mdrunner()
 
         const bool useModularSimulator = inputIsCompatibleWithModularSimulator && !(getenv("GMX_DISABLE_MODULAR_SIMULATOR") != nullptr);
 
+        // TODO This is not the right place to manage the lifetime of
+        // this data structure, but currently it's the easiest way to
+        // make it work.
+        MdrunScheduleWorkload runScheduleWork;
+        // Also populates the simulation constant workload description.
+        runScheduleWork.simulationWork = createSimulationWorkload(useGpuForNonbonded,
+                                                                  pmeRunMode,
+                                                                  useGpuForBonded,
+                                                                  useGpuForUpdate,
+                                                                  devFlags.enableGpuBufferOps,
+                                                                  devFlags.enableGpuHaloExchange,
+                                                                  devFlags.enableGpuPmePPComm);
+
         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
-        if (gpusWereDetected && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || devFlags.enableGpuBufferOps))
+        if (gpusWereDetected && ((useGpuForPme && thisRankHasDuty(cr, DUTY_PME)) || runScheduleWork.simulationWork.useGpuBufferOps))
         {
             const void         *pmeStream      = pme_gpu_get_device_stream(fr->pmedata);
             const void         *localStream    = fr->nbv->gpu_nbv != nullptr ? Nbnxm::gpu_get_command_stream(fr->nbv->gpu_nbv, InteractionLocality::Local) : nullptr;
@@ -1596,21 +1606,6 @@ int Mdrunner::mdrunner()
             fr->stateGpu = stateGpu.get();
         }
 
-        // TODO This is not the right place to manage the lifetime of
-        // this data structure, but currently it's the easiest way to
-        // make it work.
-        MdrunScheduleWorkload runScheduleWork;
-        // Also populates the simulation constant workload description.
-        runScheduleWork.simulationWork = createSimulationWorkload(useGpuForNonbonded,
-                                                                  useGpuForPme,
-                                                                  (pmeRunMode == PmeRunMode::GPU),
-                                                                  useGpuForBonded,
-                                                                  useGpuForUpdate,
-                                                                  devFlags.enableGpuBufferOps,
-                                                                  devFlags.enableGpuHaloExchange,
-                                                                  devFlags.enableGpuPmePPComm);
-
-
         GMX_ASSERT(stopHandlerBuilder_, "Runner must provide StopHandlerBuilder to simulator.");
         SimulatorBuilder simulatorBuilder;
 
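
The runner.cpp reordering above is required because the decisions feed into one
another: the GPU-update decision must exist before the simulation workload is
built (where it overrides the buffer-ops development flag), and the workload in
turn gates the construction of StatePropagatorDataGpu. A minimal sketch of that
dependency chain, with stand-in types rather than the GROMACS API:

#include <cassert>

struct DevFlags           { bool enableGpuBufferOps = false; };
struct SimulationWorkload { bool useGpuBufferOps    = false; };

static SimulationWorkload buildWorkload(const DevFlags &devFlags, bool useGpuForUpdate)
{
    SimulationWorkload work;
    // GPU update requires GPU buffer ops, so the update decision overrides
    // the development flag (mirrors createSimulationWorkload()).
    work.useGpuBufferOps = devFlags.enableGpuBufferOps || useGpuForUpdate;
    return work;
}

int main()
{
    const DevFlags devFlags{};         // GMX_USE_GPU_BUFFER_OPS not set
    const bool useGpuForUpdate = true; // decided first, before the workload is built
    const SimulationWorkload work = buildWorkload(devFlags, useGpuForUpdate);
    assert(work.useGpuBufferOps);      // buffer ops forced on by the GPU update path
    return 0;
}
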
index 147199771cb7ed5d5ff396b7e90fb4c945a9d2c6..6594081c509036399646ee972944d7f6278be54d 100644 (file)
@@ -116,6 +116,8 @@ class DomainLifetimeWorkload
         bool haveCpuListedForceWork = false;
         //! Whether the current nstlist step-range has special forces on the CPU.
         bool haveSpecialForces = false;
+        //! Whether there are currently any local forces to be computed on the CPU
+        bool haveCpuLocalForceWork = false;
 
         // TODO
         //! Whether the current nstlist step-range has free energy work on the CPU.
@@ -134,12 +136,16 @@ class DomainLifetimeWorkload
 class SimulationWorkload
 {
     public:
+        //! If we have calculation of short range nonbondeds on CPU
+        bool useCpuNonbonded           = false;
         //! If we have calculation of short range nonbondeds on GPU
         bool useGpuNonbonded           = false;
-        //! If we have calculation of long range PME in GPU
-        bool usePmeGpu                 = false;
+        //! If we have calculation of long range PME on CPU
+        bool useCpuPme                 = false;
+        //! If we have calculation of long range PME on GPU
+        bool useGpuPme                 = false;
         //! If PME FFT solving is done on GPU.
-        bool usePmeFftGpu              = false;
+        bool useGpuPmeFft              = false;
         //! If bonded interactions are calculated on GPU.
         bool useGpuBonded              = false;
         //! If update and constraint solving is performed on GPU.
@@ -149,7 +155,7 @@ class SimulationWorkload
         //! If domain decomposition halo exchange is performed on GPU.
         bool useGpuHaloExchange        = false;
         //! If direct PP-PME communication between GPUs is used.
-        bool useGpuPmePPCommunication  = false;
+        bool useGpuPmePpCommunication  = false;
         //! If direct GPU-GPU communication is enabled.
         bool useGpuDirectCommunication = false;
 };
index 71ca58d1584f1f5647bc5b584a0fcf0190337b81..f40814e23a2085be8e551d5283ca6008a3ab5862 100644 (file)
@@ -494,7 +494,6 @@ bool decideWhetherToUseGpuForUpdate(const bool        forceGpuUpdateDefaultOn,
                                     const bool        isDomainDecomposition,
                                     const bool        useGpuForPme,
                                     const bool        useGpuForNonbonded,
-                                    const bool        useGpuForBufferOps,
                                     const TaskTarget  updateTarget,
                                     const bool        gpusWereDetected,
                                     const t_inputrec &inputrec,
@@ -516,10 +515,10 @@ bool decideWhetherToUseGpuForUpdate(const bool        forceGpuUpdateDefaultOn,
     {
         errorMessage += "Domain decomposition is not supported.\n";
     }
-    // Using the GPU-version of update makes sense if forces are already on the GPU, i.e. if at least:
-    // 1. PME is on the GPU (there should be a copy of coordinates on a GPU in rvec format for PME spread).
-    // 2. Non-bonded interactions and buffer ops are on the GPU.
-    if (!(useGpuForPme || (useGpuForNonbonded && useGpuForBufferOps)))
+    // Using the GPU version of update makes sense if:
+    // 1. PME is on the GPU (there should be a copy of the coordinates on the GPU for PME spread), or
+    // 2. Non-bonded interactions are on the GPU.
+    if (!(useGpuForPme || useGpuForNonbonded))
     {
         errorMessage += "Either PME or short-ranged non-bonded interaction tasks must run on the GPU.\n";
     }
index c74ca0a197ce96071b0a9001a8b7152e0fa4f39c..6b8685fa3e1f886e029915a959e6143d798840c7 100644 (file)
@@ -235,7 +235,6 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
  * \param[in]  isDomainDecomposition     Whether there is more than one domain.
  * \param[in]  useGpuForPme              Whether GPUs will be used for PME interactions.
  * \param[in]  useGpuForNonbonded        Whether GPUs will be used for nonbonded interactions.
- * \param[in]  useGpuForBufferOps        Whether GPUs will be used for buffer operations.
  * \param[in]  updateTarget              User choice for running simulation on GPU.
  * \param[in]  gpusWereDetected          Whether compatible GPUs were detected on any node.
  * \param[in]  inputrec                  The user input.
@@ -253,7 +252,6 @@ bool decideWhetherToUseGpuForUpdate(bool              forceGpuUpdateDefaultOn,
                                     bool              isDomainDecomposition,
                                     bool              useGpuForPme,
                                     bool              useGpuForNonbonded,
-                                    bool              useGpuForBufferOps,
                                     TaskTarget        updateTarget,
                                     bool              gpusWereDetected,
                                     const t_inputrec &inputrec,
index 17c3dc06af349f9a5620d797952335ba15cb712a..ea82f95f42bccd1eef0cef622a3d343a0204ec28 100644 (file)
 
 #include "decidesimulationworkload.h"
 
+#include "gromacs/ewald/pme.h"
 #include "gromacs/taskassignment/taskassignment.h"
 #include "gromacs/utility/arrayref.h"
 
 namespace gmx
 {
 
-SimulationWorkload createSimulationWorkload(bool useGpuForNonbonded,
-                                            bool useGpuForPme,
-                                            bool useGpuForPmeFft,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdateConstraints,
-                                            bool useGpuForBufferOps,
-                                            bool useGpuHaloExchange,
-                                            bool useGpuPmePpComm)
+SimulationWorkload createSimulationWorkload(bool       useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool       useGpuForBonded,
+                                            bool       useGpuForUpdate,
+                                            bool       useGpuForBufferOps,
+                                            bool       useGpuHaloExchange,
+                                            bool       useGpuPmePpComm)
 {
-    SimulationWorkload simulationWorkload {
-        useGpuForNonbonded,
-        useGpuForPme,
-        useGpuForPmeFft,
-        useGpuForBonded,
-        useGpuForUpdateConstraints,
-        useGpuForBufferOps,
-        useGpuHaloExchange,
-        useGpuPmePpComm,
-        useGpuHaloExchange || useGpuPmePpComm
-    };
+    SimulationWorkload simulationWorkload;
+    simulationWorkload.useCpuNonbonded           = !useGpuForNonbonded;
+    simulationWorkload.useGpuNonbonded           = useGpuForNonbonded;
+    simulationWorkload.useCpuPme                 = (pmeRunMode == PmeRunMode::CPU);
+    simulationWorkload.useGpuPme                 = (pmeRunMode == PmeRunMode::GPU || pmeRunMode == PmeRunMode::Mixed);
+    simulationWorkload.useGpuPmeFft              = (pmeRunMode == PmeRunMode::Mixed);
+    simulationWorkload.useGpuBonded              = useGpuForBonded;
+    simulationWorkload.useGpuUpdate              = useGpuForUpdate;
+    simulationWorkload.useGpuBufferOps           = useGpuForBufferOps || useGpuForUpdate;
+    simulationWorkload.useGpuHaloExchange        = useGpuHaloExchange;
+    simulationWorkload.useGpuPmePpCommunication  = useGpuPmePpComm;
+    simulationWorkload.useGpuDirectCommunication = useGpuHaloExchange || useGpuPmePpComm;
 
     return simulationWorkload;
 }
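
For reference, a standalone sketch of the PmeRunMode-to-flags mapping that
createSimulationWorkload() now performs, mirroring the assignments in the hunk
above (the enum and struct below are stand-ins, not the GROMACS headers):

#include <cassert>

enum class PmeRunMode { None, CPU, GPU, Mixed };

struct PmeWorkFlags { bool useCpuPme = false, useGpuPme = false, useGpuPmeFft = false; };

static PmeWorkFlags pmeFlagsFromRunMode(PmeRunMode pmeRunMode)
{
    PmeWorkFlags flags;
    flags.useCpuPme    = (pmeRunMode == PmeRunMode::CPU);
    flags.useGpuPme    = (pmeRunMode == PmeRunMode::GPU || pmeRunMode == PmeRunMode::Mixed);
    flags.useGpuPmeFft = (pmeRunMode == PmeRunMode::Mixed);
    return flags;
}

int main()
{
    // A single PmeRunMode value replaces the pair of booleans that the callers
    // previously had to keep mutually consistent by hand.
    assert(pmeFlagsFromRunMode(PmeRunMode::GPU).useGpuPme);
    assert(pmeFlagsFromRunMode(PmeRunMode::Mixed).useGpuPme);
    assert(pmeFlagsFromRunMode(PmeRunMode::Mixed).useGpuPmeFft);
    assert(pmeFlagsFromRunMode(PmeRunMode::CPU).useCpuPme);
    return 0;
}
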
index 664a307b93c632fccd0de1f0e1a5701a1be7adef..b389da53c168a36d98396f004d532f196f800a17 100644 (file)
@@ -46,6 +46,8 @@
 
 #include "gromacs/mdtypes/simulation_workload.h"
 
+enum class PmeRunMode;
+
 namespace gmx
 {
 
@@ -55,24 +57,22 @@ namespace gmx
  *
  * \param[in] useGpuForNonbonded If we have short-range nonbonded interaction
  *                               calculations on GPU(s).
- * \param[in] useGpuForPme       If long range PME interactions are calculated on GPU(s).
- * \param[in] useGpuForPmeFft    If FFT solving for PME is done on the GPU.
+ * \param[in] pmeRunMode         Run mode indicating what resource PME is executed on.
  * \param[in] useGpuForBonded    If bonded interactions are calculated on GPU(s).
- * \param[in] useGpuForUpdateConstraints If coordinate update and constraint solving is performed on
- *                                       GPU(s).
+ * \param[in] useGpuForUpdate    If coordinate update and constraint solving is performed on
+ *                               GPU(s).
  * \param[in] useGpuForBufferOps If buffer ops / reduction are calculated on GPU(s).
  * \param[in] useGpuHaloExchange If GPU direct communication is used in halo exchange.
  * \param[in] useGpuPmePpComm    If GPU direct communication is used in PME-PP communication.
  * \returns Simulation lifetime constant workload description.
  */
-SimulationWorkload createSimulationWorkload(bool useGpuForNonbonded,
-                                            bool useGpuForPme,
-                                            bool useGpuForPmeFft,
-                                            bool useGpuForBonded,
-                                            bool useGpuForUpdateConstraints,
-                                            bool useGpuForBufferOps,
-                                            bool useGpuHaloExchange,
-                                            bool useGpuPmePpComm);
+SimulationWorkload createSimulationWorkload(bool       useGpuForNonbonded,
+                                            PmeRunMode pmeRunMode,
+                                            bool       useGpuForBonded,
+                                            bool       useGpuForUpdate,
+                                            bool       useGpuForBufferOps,
+                                            bool       useGpuHaloExchange,
+                                            bool       useGpuPmePpComm);
 
 
 }  // namespace gmx