Improve GPU update task assignment consistency
author Szilárd Páll <pall.szilard@gmail.com>
Fri, 24 Jan 2020 13:12:28 +0000 (14:12 +0100)
committer Paul Bauer <paul.bauer.q@gmail.com>
Tue, 25 Feb 2020 16:45:23 +0000 (17:45 +0100)
GPU update task assignment was not consistent with the assumptions and
supported features of the 2020 release, and it did not implement the
correct checks and fallback for cases where GPU update is not supported.
Specifically, this change makes sure that when separate PME ranks are
used without direct GPU communication for PP-PME, GPU update falls back
to the CPU.

Fixes #3354

Change-Id: I7c9dd67cd8cf61f0201b626b8b7674917e3365a5

src/gromacs/mdrun/runner.cpp
src/gromacs/taskassignment/decidegpuusage.cpp
src/gromacs/taskassignment/decidegpuusage.h

index f47caae0905ed13fa1a7355fd3db169081b018ea..93c934a9965807922dc74da7d12fb80030afc069 100644 (file)
 namespace gmx
 {
 
-/*! \brief Structure that holds boolean flags corresponding to the development
- *        features present enabled through environment variables.
- *
- */
-struct DevelopmentFeatureFlags
-{
-    //! True if the Buffer ops development feature is enabled
-    // TODO: when the trigger of the buffer ops offload is fully automated this should go away
-    bool enableGpuBufferOps = false;
-    //! If true, forces 'mdrun -update auto' default to 'gpu'
-    bool forceGpuUpdateDefault = false;
-    //! True if the GPU halo exchange development feature is enabled
-    bool enableGpuHaloExchange = false;
-    //! True if the PME PP direct communication GPU development feature is enabled
-    bool enableGpuPmePPComm = false;
-};
 
 /*! \brief Manage any development feature flag variables encountered
  *
@@ -1172,10 +1156,10 @@ int Mdrunner::mdrunner()
         const bool useUpdateGroups = cr->dd ? ddUsesUpdateGroups(*cr->dd) : false;
 
         useGpuForUpdate = decideWhetherToUseGpuForUpdate(
-                devFlags.forceGpuUpdateDefault, useDomainDecomposition, useUpdateGroups, pmeRunMode,
-                domdecOptions.numPmeRanks > 0, useGpuForNonbonded, updateTarget, gpusWereDetected,
-                *inputrec, mtop, doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
-                replExParams.exchangeInterval > 0, doRerun, mdlog);
+                useDomainDecomposition, useUpdateGroups, pmeRunMode, domdecOptions.numPmeRanks > 0,
+                useGpuForNonbonded, updateTarget, gpusWereDetected, *inputrec, mtop,
+                doEssentialDynamics, gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+                replExParams.exchangeInterval > 0, doRerun, devFlags, mdlog);
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
 
index fdb741421275ac190a3121d1803ff3a652314165..91779f29c858a3f68c9587635f09fac61a38bcb2 100644 (file)
@@ -520,48 +520,73 @@ bool decideWhetherToUseGpusForBonded(const bool       useGpuForNonbonded,
     return gpusWereDetected && usingOurCpuForPmeOrEwald;
 }
 
-bool decideWhetherToUseGpuForUpdate(const bool           forceGpuUpdateDefault,
-                                    const bool           isDomainDecomposition,
-                                    const bool           useUpdateGroups,
-                                    const PmeRunMode     pmeRunMode,
-                                    const bool           havePmeOnlyRank,
-                                    const bool           useGpuForNonbonded,
-                                    const TaskTarget     updateTarget,
-                                    const bool           gpusWereDetected,
-                                    const t_inputrec&    inputrec,
-                                    const gmx_mtop_t&    mtop,
-                                    const bool           useEssentialDynamics,
-                                    const bool           doOrientationRestraints,
-                                    const bool           useReplicaExchange,
-                                    const bool           doRerun,
-                                    const gmx::MDLogger& mdlog)
+bool decideWhetherToUseGpuForUpdate(const bool                     isDomainDecomposition,
+                                    const bool                     useUpdateGroups,
+                                    const PmeRunMode               pmeRunMode,
+                                    const bool                     havePmeOnlyRank,
+                                    const bool                     useGpuForNonbonded,
+                                    const TaskTarget               updateTarget,
+                                    const bool                     gpusWereDetected,
+                                    const t_inputrec&              inputrec,
+                                    const gmx_mtop_t&              mtop,
+                                    const bool                     useEssentialDynamics,
+                                    const bool                     doOrientationRestraints,
+                                    const bool                     useReplicaExchange,
+                                    const bool                     doRerun,
+                                    const DevelopmentFeatureFlags& devFlags,
+                                    const gmx::MDLogger&           mdlog)
 {
 
     // '-update cpu' overrides the environment variable, '-update auto' does not
-    if (updateTarget == TaskTarget::Cpu || (updateTarget == TaskTarget::Auto && !forceGpuUpdateDefault))
+    if (updateTarget == TaskTarget::Cpu
+        || (updateTarget == TaskTarget::Auto && !devFlags.forceGpuUpdateDefault))
     {
         return false;
     }
 
     const bool hasAnyConstraints = gmx_mtop_interaction_count(mtop, IF_CONSTRAINT) > 0;
+    const bool pmeUsesCpu = (pmeRunMode == PmeRunMode::CPU || pmeRunMode == PmeRunMode::Mixed);
 
     std::string errorMessage;
 
     if (isDomainDecomposition)
     {
-        if (!forceGpuUpdateDefault)
+        if (!devFlags.enableGpuHaloExchange)
         {
-            errorMessage += "Domain decomposition is not supported.\n ";
+            errorMessage += "Domain decomposition without GPU halo exchange is not supported.\n ";
         }
-        else if (hasAnyConstraints && !useUpdateGroups)
+        else
+        {
+            if (hasAnyConstraints && !useUpdateGroups)
+            {
+                errorMessage +=
+                        "Domain decomposition is only supported with constraints when update "
+                        "groups "
+                        "are used. This means constraining all bonds is not supported, except for "
+                        "small molecules, and box sizes close to half the pair-list cutoff are not "
+                        "supported.\n ";
+            }
+
+            if (pmeUsesCpu)
+            {
+                errorMessage += "With domain decomposition, PME must run fully on the GPU.\n";
+            }
+        }
+    }
+
+    if (havePmeOnlyRank)
+    {
+        if (pmeUsesCpu)
+        {
+            errorMessage += "With separate PME rank(s), PME must run fully on the GPU.\n";
+        }
+
+        if (!devFlags.enableGpuPmePPComm)
         {
-            errorMessage +=
-                    "Domain decomposition is only supported with constraints when update groups "
-                    "are used. This means constraining all bonds is not supported, except for "
-                    "small molecules, and box sizes close to half the pair-list cutoff are not "
-                    "supported.\n ";
+            errorMessage += "With separate PME rank(s), PME must use direct communication.\n";
         }
     }
+
     if (inputrec.eConstrAlg == econtSHAKE && hasAnyConstraints && gmx_mtop_ftype_count(mtop, F_CONSTR) > 0)
     {
         errorMessage += "SHAKE constraints are not supported.\n";
@@ -575,18 +600,6 @@ bool decideWhetherToUseGpuForUpdate(const bool           forceGpuUpdateDefault,
                 "Either PME or short-ranged non-bonded interaction tasks must run on the GPU.\n";
     }
 
-    // If PME is active (i.e. not PmeRunMode::None), then GPU update requires
-    // either a single-rank run, or that PME runs fully on the GPU.
-    const bool pmeRunningOnCpu = (pmeRunMode == PmeRunMode::CPU || pmeRunMode == PmeRunMode::Mixed);
-    if (pmeRunningOnCpu && isDomainDecomposition)
-    {
-        errorMessage += "With domain decomposition, PME must run fully on the GPU.\n";
-    }
-    if (pmeRunningOnCpu && havePmeOnlyRank)
-    {
-        errorMessage += "With separate PME rank(s), PME must run fully on the GPU.\n";
-    }
-
     if (!gpusWereDetected)
     {
         errorMessage += "Compatible GPUs must have been found.\n";
@@ -667,7 +680,7 @@ bool decideWhetherToUseGpuForUpdate(const bool           forceGpuUpdateDefault,
 
     if (!errorMessage.empty())
     {
-        if (updateTarget != TaskTarget::Gpu && forceGpuUpdateDefault)
+        if (updateTarget == TaskTarget::Auto && devFlags.forceGpuUpdateDefault)
         {
             GMX_LOG(mdlog.warning)
                     .asParagraph()
@@ -688,14 +701,8 @@ bool decideWhetherToUseGpuForUpdate(const bool           forceGpuUpdateDefault,
         return false;
     }
 
-    if (isDomainDecomposition)
-    {
-        return forceGpuUpdateDefault;
-    }
-    else
-    {
-        return (updateTarget == TaskTarget::Gpu || forceGpuUpdateDefault);
-    }
+    return (updateTarget == TaskTarget::Gpu
+            || (updateTarget == TaskTarget::Auto && devFlags.forceGpuUpdateDefault));
 }
 
 } // namespace gmx
index 921b0a778bdf14e626ea63cf8424c92038999405..bfb002547a36996a9beb535ee09d108c55b26826 100644 (file)
@@ -72,6 +72,25 @@ enum class EmulateGpuNonbonded : bool
     Yes
 };
 
+/*! \libinternal
+ *  \brief Structure that holds boolean flags corresponding to the development
+ *        features present enabled through environment variables.
+ *
+ */
+struct DevelopmentFeatureFlags
+{
+    //! True if the Buffer ops development feature is enabled
+    // TODO: when the trigger of the buffer ops offload is fully automated this should go away
+    bool enableGpuBufferOps = false;
+    //! If true, forces 'mdrun -update auto' default to 'gpu'
+    bool forceGpuUpdateDefault = false;
+    //! True if the GPU halo exchange development feature is enabled
+    bool enableGpuHaloExchange = false;
+    //! True if the PME PP direct communication GPU development feature is enabled
+    bool enableGpuPmePPComm = false;
+};
+
+
 class MDAtoms;
 
 /*! \brief Decide whether this thread-MPI simulation will run
@@ -249,7 +268,6 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
 
 /*! \brief Decide whether to use GPU for update.
  *
- * \param[in]  forceGpuUpdateDefault        If update should run on GPU by default.
  * \param[in]  isDomainDecomposition        Whether there more than one domain.
  * \param[in]  useUpdateGroups              If the constraints can be split across domains.
  * \param[in]  pmeRunMode                   PME running mode: CPU, GPU or mixed.
@@ -263,27 +281,28 @@ bool decideWhetherToUseGpusForBonded(bool       useGpuForNonbonded,
  * \param[in]  doOrientationRestraints      If orientation restraints are enabled.
  * \param[in]  useReplicaExchange           If this is a REMD simulation.
  * \param[in]  doRerun                      It this is a rerun.
+ * \param[in]  devFlags                     GPU development / experimental feature flags.
  * \param[in]  mdlog                        MD logger.
  *
  * \returns    Whether complete simulation can be run on GPU.
  * \throws     std::bad_alloc            If out of memory
  *             InconsistentInputError    If the user requirements are inconsistent.
  */
-bool decideWhetherToUseGpuForUpdate(bool                 forceGpuUpdateDefault,
-                                    bool                 isDomainDecomposition,
-                                    bool                 useUpdateGroups,
-                                    PmeRunMode           pmeRunMode,
-                                    bool                 havePmeOnlyRank,
-                                    bool                 useGpuForNonbonded,
-                                    TaskTarget           updateTarget,
-                                    bool                 gpusWereDetected,
-                                    const t_inputrec&    inputrec,
-                                    const gmx_mtop_t&    mtop,
-                                    bool                 useEssentialDynamics,
-                                    bool                 doOrientationRestraints,
-                                    bool                 useReplicaExchange,
-                                    bool                 doRerun,
-                                    const gmx::MDLogger& mdlog);
+bool decideWhetherToUseGpuForUpdate(bool                           isDomainDecomposition,
+                                    bool                           useUpdateGroups,
+                                    PmeRunMode                     pmeRunMode,
+                                    bool                           havePmeOnlyRank,
+                                    bool                           useGpuForNonbonded,
+                                    TaskTarget                     updateTarget,
+                                    bool                           gpusWereDetected,
+                                    const t_inputrec&              inputrec,
+                                    const gmx_mtop_t&              mtop,
+                                    bool                           useEssentialDynamics,
+                                    bool                           doOrientationRestraints,
+                                    bool                           useReplicaExchange,
+                                    bool                           doRerun,
+                                    const DevelopmentFeatureFlags& devFlags,
+                                    const gmx::MDLogger&           mdlog);
 
 
 } // namespace gmx