Fix mdrun -nb auto -pme auto when GPUs are absent
author Mark Abraham <mark.j.abraham@gmail.com>
Tue, 5 Dec 2017 09:27:21 +0000 (20:27 +1100)
committer Kasson <kasson@gmail.com>
Tue, 5 Dec 2017 13:10:07 +0000 (14:10 +0100)
The logic was flawed such that GPUs were "selected" for use even
though none had been detected. That led to the GPU behaviour of
avoiding using separate PME ranks.

Also made a minor fix to the logic for emulation. The new
interpretation of mdrun -gpu_id does not need to trigger an error when
GPU IDs have been supplied along with the emulation environment
variable.

Fixes #2315

Change-Id: I68da27c9bfef9f73b9dae4f04f196066d2efb1e2

src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/decidegpuusage.h
src/programs/mdrun/runner.cpp

index 6002bb913bb0639fabe885db7fd735b29aec4f11..d5132add212e7c986de4c39332978081643d1468 100644 (file)
@@ -233,11 +233,11 @@ decideWhetherToUseGpusForPmeWithThreadMpi(const bool              useGpuForNonbo
 }
 
 bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarget,
-                                        const std::vector<int>    &gpuIdsToUse,
                                         const std::vector<int>    &userGpuTaskAssignment,
                                         const EmulateGpuNonbonded  emulateGpuNonbonded,
                                         const bool                 usingVerletScheme,
-                                        const bool                 nonbondedOnGpuIsUseful)
+                                        const bool                 nonbondedOnGpuIsUseful,
+                                        const bool                 gpusWereDetected)
 {
     if (nonbondedTarget == TaskTarget::Cpu)
     {
@@ -262,7 +262,7 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarg
                           ("Nonbonded interactions on the GPU were required, which is inconsistent "
                           "with choosing emulation. Make no more than one of these choices."));
         }
-        if (!gpuIdsToUse.empty() || !userGpuTaskAssignment.empty())
+        if (!userGpuTaskAssignment.empty())
         {
             GMX_THROW(InconsistentInputError
                           ("GPU ID usage was specified, as was GPU emulation. Make no more than one of these choices."));
@@ -306,13 +306,18 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarg
         return true;
     }
 
-    // We still don't know whether it is an error if no GPUs are found
-    // because we don't know the duty of this rank, yet. For example,
-    // a node with only PME ranks and -pme cpu is OK if there are not
-    // GPUs.
+    if (nonbondedTarget == TaskTarget::Gpu)
+    {
+        // We still don't know whether it is an error if no GPUs are found
+        // because we don't know the duty of this rank, yet. For example,
+        // a node with only PME ranks and -pme cpu is OK if there are not
+        // GPUs.
+        return true;
+    }
 
-    // If we get here, then the user permitted or required GPUs.
-    return true;
+    // If we get here, then the user permitted GPUs, which we should
+    // use for nonbonded interactions.
+    return gpusWereDetected;
 }
 
 bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
@@ -320,7 +325,8 @@ bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
                                   const std::vector<int> &userGpuTaskAssignment,
                                   const bool              canUseGpuForPme,
                                   const int               numRanksPerSimulation,
-                                  const int               numPmeRanksPerSimulation)
+                                  const int               numPmeRanksPerSimulation,
+                                  const bool              gpusWereDetected)
 {
     if (pmeTarget == TaskTarget::Cpu)
     {
@@ -389,11 +395,13 @@ bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
         return true;
     }
 
+    // If we get here, then the user permitted GPUs.
     if (numRanksPerSimulation == 1)
     {
-        // PME can run well on a single GPU shared with NB when
-        // there is one rank, so we permit mdrun to try that.
-        return true;
+        // PME can run well on a single GPU shared with NB when there
+        // is one rank, so we permit mdrun to try that if we have
+        // detected GPUs.
+        return gpusWereDetected;
     }
 
     // Not enough support for PME on GPUs for anything else
index 438e00b8a6e48a23e6d95ade1a3ad95ed37db12c..7e01c99eabf3e9f01fb8dd62396125b4a27a08a3 100644 (file)
@@ -133,22 +133,22 @@ bool decideWhetherToUseGpusForPmeWithThreadMpi(const bool              useGpuFor
  * consistency checks.
  *
  * \param[in]  nonbondedTarget           The user's choice for mdrun -nb for where to assign short-ranged nonbonded interaction tasks.
- * \param[in]  gpuIdsToUse               The compatible GPUs that the user permitted us to use.
  * \param[in]  userGpuTaskAssignment     The user-specified assignment of GPU tasks to device IDs.
  * \param[in]  emulateGpuNonbonded       Whether we will emulate GPU calculation of nonbonded interactions.
  * \param[in]  usingVerletScheme         Whether the nonbondeds are using the Verlet scheme.
  * \param[in]  nonbondedOnGpuIsUseful    Whether computing nonbonded interactions on a GPU is useful for this calculation.
+ * \param[in]  gpusWereDetected          Whether compatible GPUs were detected on any node.
  *
  * \returns    Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs.
  *
  * \throws     std::bad_alloc          If out of memory
  *             InconsistentInputError  If the user requirements are inconsistent. */
 bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarget,
-                                        const std::vector<int>    &gpuIdsToUse,
                                         const std::vector<int>    &userGpuTaskAssignment,
                                         const EmulateGpuNonbonded  emulateGpuNonbonded,
                                         const bool                 usingVerletScheme,
-                                        const bool                 nonbondedOnGpuIsUseful);
+                                        const bool                 nonbondedOnGpuIsUseful,
+                                        const bool                 gpusWereDetected);
 
 /*! \brief Decide whether the simulation will try to run tasks of
  * different types on GPUs.
@@ -172,6 +172,7 @@ bool decideWhetherToUseGpusForNonbonded(const TaskTarget           nonbondedTarg
  * \param[in]  canUseGpuForPme           Whether the form of PME chosen can run on a GPU
  * \param[in]  numRanksPerSimulation     The number of ranks in each simulation.
  * \param[in]  numPmeRanksPerSimulation  The number of PME ranks in each simulation.
+ * \param[in]  gpusWereDetected          Whether compatible GPUs were detected on any node.
  *
  * \returns    Whether the simulation will run nonbonded and PME tasks, respectively, on GPUs.
  *
@@ -182,7 +183,8 @@ bool decideWhetherToUseGpusForPme(const bool              useGpuForNonbonded,
                                   const std::vector<int> &userGpuTaskAssignment,
                                   const bool              canUseGpuForPme,
                                   const int               numRanksPerSimulation,
-                                  const int               numPmeRanksPerSimulation);
+                                  const int               numPmeRanksPerSimulation,
+                                  const bool              gpusWereDetected);
 
 }
 
index d0d7a0cb8b5668300740854d07a17a63b0502408..7dabeb32e73c1ecb7c4c22040af56282bcf3b7d0 100644 (file)
@@ -647,12 +647,20 @@ int Mdrunner::mdrunner()
     bool useGpuForPme       = false;
     try
     {
-        useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, gpuIdsToUse, userGpuTaskAssignment,
+        // It's possible that there are different numbers of GPUs on
+        // different nodes, which is the user's responsibility to
+        // handle. If unsuitable, we will notice that during task
+        // assignment.
+        bool gpusWereDetected = hwinfo->ngpu_compatible_tot > 0;
+        useGpuForNonbonded = decideWhetherToUseGpusForNonbonded(nonbondedTarget, userGpuTaskAssignment,
                                                                 emulateGpuNonbonded, inputrec->cutoff_scheme == ecutsVERLET,
-                                                                gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun));
+                                                                gpuAccelerationOfNonbondedIsUseful(mdlog, inputrec, doRerun),
+                                                                gpusWereDetected);
         auto inputSystemHasPme = EEL_PME(inputrec->coulombtype) || EVDW_PME(inputrec->vdwtype);
         auto canUseGpuForPme   = inputSystemHasPme && pme_gpu_supports_input(inputrec, nullptr);
-        useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment, canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks);
+        useGpuForPme = decideWhetherToUseGpusForPme(useGpuForNonbonded, pmeTarget, userGpuTaskAssignment,
+                                                    canUseGpuForPme, cr->nnodes, domdecOptions.numPmeRanks,
+                                                    gpusWereDetected);
         pmeRunMode   = (useGpuForPme ? PmeRunMode::GPU : PmeRunMode::CPU);
         if ((pmeRunMode == PmeRunMode::GPU) && (pmeFftTarget == TaskTarget::Cpu))
         {