From: Mark Abraham <mark.j.abraham@gmail.com>
Date: Tue, 5 Dec 2017 09:27:21 +0000 (+1100)
Subject: Fix mdrun -nb auto -pme auto when GPUs are absent
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=a0d89bf8e412554a58e803715847b0f9dd920886;p=alexxy%2Fgromacs.git

Fix mdrun -nb auto -pme auto when GPUs are absent

The logic was flawed such that GPUs were "selected" for use even
though none had been detected. That led to the GPU behaviour of
avoiding using separate PME ranks.

Also made a minor fix to the logic for emulation. The new
interpretation of mdrun -gpu_id does not need to trigger an error when
GPU IDs have been supplied along with the emulation environment
variable.

Fixes #2315

Change-Id: I68da27c9bfef9f73b9dae4f04f196066d2efb1e2
---

diff --git a/src/gromacs/taskassignment/decidegpuusage.cpp b/src/gromacs/taskassignment/decidegpuusage.cpp
index 6002bb913b..d5132add21 100644
--- a/src/gromacs/taskassignment/decidegpuusage.cpp
+++ b/src/gromacs/taskassignment/decidegpuusage.cpp
@@ -233,11 +233,11 @@ decideWhetherToUseGpusForPmeWithThreadMpi(const bool              useGpuForNonbo
 }
 
 bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarget,
-                                        const std::vector<int>    &gpuIdsToUse,
                                         const std::vector<int>    &userGpuTaskAssignment,
                                         const EmulateGpuNonbonded  emulateGpuNonbonded,
                                         const bool                 usingVerletScheme,
-                                        const bool                 nonbondedOnGpuIsUseful)
+                                        const bool                 nonbondedOnGpuIsUseful,
+                                        const bool                 gpusWereDetected)
 {
     if (nonbondedTarget == TaskTarget::Cpu)
     {
@@ -262,7 +262,7 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarg
                           ("Nonbonded interactions on the GPU were required, which is inconsistent "
                           "with choosing emulation. Make no more than one of these choices."));
         }
-        if (!gpuIdsToUse.empty() || !userGpuTaskAssignment.empty())
+        if (!userGpuTaskAssignment.empty())
         {
             GMX_THROW(InconsistentInputError
                           ("GPU ID usage was specified, as was GPU emulation. Make no more than one of these choices."));
@@ -306,13 +306,18 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarg
         return true;
     }
 
-    // We still don't know whether it is an error if no GPUs are found
-    // because we don't know the duty of this rank, yet. For example,
-    // a node with only PME ranks and -pme cpu is OK if there are not
-    // GPUs.
+    if (nonbondedTarget == TaskTarget::Gpu)
+    {
+        // We still don't know whether it is an error if no GPUs are found
+        // because we don't know the duty of this rank, yet. For example,
+        // a node with only PME ranks and -pme cpu is OK if there are no
+        // GPUs.
+        return true;
+    }
 
-    // If we get here, then the user permitted or required GPUs.
-    return true;
+    // If we get here, then the user permitted GPUs, which we should
+    // use for nonbonded interactions.
+    return gpusWereDetected;
 }
 
 bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
@@ -320,7 +325,8 @@ bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
                                   const std::vector<int> &userGpuTaskAssignment,
                                   const bool              canUseGpuForPme,
                                   const int               numRanksPerSimulation,
-                                  const int               numPmeRanksPerSimulation)
+                                  const int               numPmeRanksPerSimulation,
+                                  const bool              gpusWereDetected)
 {
     if (pmeTarget == TaskTarget::Cpu)
     {
@@ -389,11 +395,13 @@ bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
         return true;
     }
 
+    // If we get here, then the user permitted GPUs.
     if (numRanksPerSimulation == 1)
     {
-        // PME can run well on a single GPU shared with NB when
-        // there is one rank, so we permit mdrun to try that.
-        return true;
+        // PME can run well on a single GPU shared with NB when there
+        // is one rank, so we permit mdrun to try that if we have
+        // detected GPUs.
+        return gpusWereDetected;
     }
 
     // Not enough support for PME on GPUs for anything else
diff --git a/src/gromacs/taskassignment/decidegpuusage.h b/src/gromacs/taskassignment/decidegpuusage.h
index 438e00b8a6..7e01c99eab 100644
--- a/src/gromacs/taskassignment/decidegpuusage.h
+++ b/src/gromacs/taskassignment/decidegpuusage.h
@@ -133,22 +133,22 @@ bool decideWhetherToUseGpusForPmeWithThreadMpi(const bool              useGpuFor
  * consistency checks.
  *
  * \param[in]  nonbondedTarget           The user's choice for mdrun -nb for where to assign short-ranged nonbonded interaction tasks.
- * \param[in]  gpuIdsToUse               The compatible GPUs that the user permitted us to use.
  * \param[in]  userGpuTaskAssignment     The user-specified assignment of GPU tasks to device IDs.
  * \param[in]  emulateGpuNonbonded       Whether we will emulate GPU calculation of nonbonded interactions.
  * \param[in]  usingVerletScheme         Whether the nonbondeds are using the Verlet scheme.
  * \param[in]  nonbondedOnGpuIsUseful    Whether computing nonbonded interactions on a GPU is useful for this calculation.
+ * \param[in]  gpusWereDetected          Whether compatible GPUs were detected on any node.
  *
  * \returns    Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs.
  *
  * \throws     std::bad_alloc          If out of memory
  *             InconsistentInputError  If the user requirements are inconsistent. */
 bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarget,
-                                        const std::vector<int>    &gpuIdsToUse,
                                         const std::vector<int>    &userGpuTaskAssignment,
                                         const EmulateGpuNonbonded  emulateGpuNonbonded,
                                         const bool                 usingVerletScheme,
-                                        const bool                 nonbondedOnGpuIsUseful);
+                                        const bool                 nonbondedOnGpuIsUseful,
+                                        const bool                 gpusWereDetected);
 
 /*! \brief Decide whether the simulation will try to run tasks of
  * different types on GPUs.
@@ -172,6 +172,7 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarg
  * \param[in]  canUseGpuForPme           Whether the form of PME chosen can run on a GPU
  * \param[in]  numRanksPerSimulation     The number of ranks in each simulation.
  * \param[in]  numPmeRanksPerSimulation  The number of PME ranks in each simulation.
+ * \param[in]  gpusWereDetected          Whether compatible GPUs were detected on any node.
  *
  * \returns    Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs.
  *
@@ -182,7 +183,8 @@ bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
                                   const std::vector<int> &userGpuTaskAssignment,
                                   const bool              canUseGpuForPme,
                                   const int               numRanksPerSimulation,
-                                  const int               numPmeRanksPerSimulation);
+                                  const int               numPmeRanksPerSimulation,
+                                  const bool              gpusWereDetected);
 
 }
 
diff --git a/src/programs/mdrun/runner.cpp b/src/programs/mdrun/runner.cpp
index d0d7a0cb8b..7dabeb32e7 100644
--- a/src/programs/mdrun/runner.cpp
+++ b/src/programs/mdrun/runner.cpp
@@ -647,12 +647,20 @@ int Mdrunner::mdrunner()
     bool useGpuForPme       = false;
     try
     {
-        useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment,
+        // It's possible that there are different numbers of GPUs on
+        // different nodes, which is the user's responsibility to
+        // handle. If unsuitable, we will notice that during task
+        // assignment.
+        bool gpusWereDetected = hwinfo->ngpu_compatible_tot > 0;
+        useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment,
                                                                 emulateGpuNonbonded, inputrec->cutoff_scheme == ecutsVERLET,
-                                                                gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun));
+                                                                gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun),
+                                                                gpusWereDetected);
         auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype);
         auto canUseGpuForPme   = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr);
-        useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks);
+        useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
+                                                    canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks,
+                                                    gpusWereDetected);
         pmeRunMode   = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
         if ((pmeRunMode == PmeRunMode::GPU) && (pmeFftTarget == TaskTarget::Cpu))
         {