Disable DLB if GPU direct communication Halo exchange is enabled.
author Gaurav Garg <gaugarg@nvidia.com>
Fri, 27 Aug 2021 07:00:46 +0000 (07:00 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Fri, 27 Aug 2021 07:00:46 +0000 (07:00 +0000)
src/gromacs/domdec/builder.h
src/gromacs/domdec/domdec.cpp
src/gromacs/mdrun/runner.cpp

index aadd54cc5973990bbd5f284d586bdc9793d1d05f..e5b9a8d92b00f370acc43b964b6da5ac6945ae77 100644 (file)
@@ -91,7 +91,8 @@ public:
                                real                              maxUpdateGroupRadius,
                                ArrayRef<const RVec>              xGlobal,
                                bool                              useGpuForNonbonded,
-                               bool                              useGpuForPme);
+                               bool                              useGpuForPme,
+                               bool                              directGpuCommUsedWithGpuUpdate);
     //! Destructor
     ~DomainDecompositionBuilder();
     //! Build the resulting DD manager
index 3627ea02d6dba4cf196572d380554b653764b384..5f26c408481443a0ec67af0691956ecba4843d33 100644 (file)
@@ -1803,18 +1803,20 @@ static DlbState forceDlbOffOrBail(DlbState             cmdlineDlbState,
  * state with other run parameters and settings. As a result, the initial state
  * may be altered or an error may be thrown if incompatibility of options is detected.
  *
- * \param [in] mdlog       Logger.
- * \param [in] dlbOption   Enum value for the DLB option.
- * \param [in] bRecordLoad True if the load balancer is recording load information.
- * \param [in] mdrunOptions  Options for mdrun.
- * \param [in] inputrec    Pointer mdrun to input parameters.
- * \returns                DLB initial/startup state.
+ * \param [in] mdlog                Logger.
+ * \param [in] dlbOption            Enum value for the DLB option.
+ * \param [in] bRecordLoad          True if the load balancer is recording load information.
+ * \param [in] mdrunOptions         Options for mdrun.
+ * \param [in] inputrec             Reference to the mdrun input parameters.
+ * \param [in] directGpuCommUsedWithGpuUpdate     True if direct GPU halo exchange and GPU update are both enabled.
+ * \returns                         DLB initial/startup state.
  */
 static DlbState determineInitialDlbState(const gmx::MDLogger&     mdlog,
                                          DlbOption                dlbOption,
                                          gmx_bool                 bRecordLoad,
                                          const gmx::MdrunOptions& mdrunOptions,
-                                         const t_inputrec&        inputrec)
+                                         const t_inputrec&        inputrec,
+                                         const bool               directGpuCommUsedWithGpuUpdate)
 {
     DlbState dlbState = DlbState::offCanTurnOn;
 
@@ -1826,6 +1828,15 @@ static DlbState determineInitialDlbState(const gmx::MDLogger&     mdlog,
         default: gmx_incons("Invalid dlbOption enum value");
     }
 
+    // Direct (P2P) GPU communication combined with GPU update leads to a case in which we
+    // enqueue async work for multiple timesteps, so DLB needs to be disabled in that case.
+    if (directGpuCommUsedWithGpuUpdate)
+    {
+        std::string reasonStr =
+                "it is not supported with GPU direct communication + GPU update enabled.";
+        return forceDlbOffOrBail(dlbState, reasonStr, mdlog);
+    }
+
     /* Reruns don't support DLB: bail or override auto mode */
     if (mdrunOptions.rerun)
     {
@@ -2775,7 +2786,8 @@ static void set_ddgrid_parameters(const gmx::MDLogger& mdlog,
 static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
                                 const DomdecOptions&     options,
                                 const gmx::MdrunOptions& mdrunOptions,
-                                const t_inputrec&        ir)
+                                const t_inputrec&        ir,
+                                const bool               directGpuCommUsedWithGpuUpdate)
 {
     DDSettings ddSettings;
 
@@ -2808,8 +2820,8 @@ static DDSettings getDDSettings(const gmx::MDLogger&     mdlog,
         ddSettings.recordLoad = (wallcycle_have_counter() && recload > 0);
     }
 
-    ddSettings.initialDlbState =
-            determineInitialDlbState(mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir);
+    ddSettings.initialDlbState = determineInitialDlbState(
+            mdlog, options.dlbOption, ddSettings.recordLoad, mdrunOptions, ir, directGpuCommUsedWithGpuUpdate);
     GMX_LOG(mdlog.info)
             .appendTextFormatted("Dynamic load balancing: %s",
                                  enumValueToString(ddSettings.initialDlbState));
@@ -2844,7 +2856,8 @@ public:
          real                              maxUpdateGroupRadius,
          ArrayRef<const RVec>              xGlobal,
          bool                              useGpuForNonbonded,
-         bool                              useGpuForPme);
+         bool                              useGpuForPme,
+         bool                              directGpuCommUsedWithGpuUpdate);
 
     //! Build the resulting DD manager
     gmx_domdec_t* build(LocalAtomSetManager* atomSets);
@@ -2899,12 +2912,13 @@ DomainDecompositionBuilder::Impl::Impl(const MDLogger&                   mdlog,
                                        const real                        maxUpdateGroupRadius,
                                        ArrayRef<const RVec>              xGlobal,
                                        bool                              useGpuForNonbonded,
-                                       bool                              useGpuForPme) :
+                                       bool                              useGpuForPme,
+                                       bool directGpuCommUsedWithGpuUpdate) :
     mdlog_(mdlog), cr_(cr), options_(options), mtop_(mtop), ir_(ir), notifiers_(notifiers)
 {
     GMX_LOG(mdlog_.info).appendTextFormatted("\nInitializing Domain Decomposition on %d ranks", cr_->sizeOfDefaultCommunicator);
 
-    ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_);
+    ddSettings_ = getDDSettings(mdlog_, options_, mdrunOptions, ir_, directGpuCommUsedWithGpuUpdate);
 
     if (ddSettings_.eFlop > 1)
     {
@@ -3038,7 +3052,8 @@ DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&
                                                        const real           maxUpdateGroupRadius,
                                                        ArrayRef<const RVec> xGlobal,
                                                        const bool           useGpuForNonbonded,
-                                                       const bool           useGpuForPme) :
+                                                       const bool           useGpuForPme,
+                                                       const bool directGpuCommUsedWithGpuUpdate) :
     impl_(new Impl(mdlog,
                    cr,
                    options,
@@ -3052,7 +3067,8 @@ DomainDecompositionBuilder::DomainDecompositionBuilder(const MDLogger&
                    maxUpdateGroupRadius,
                    xGlobal,
                    useGpuForNonbonded,
-                   useGpuForPme))
+                   useGpuForPme,
+                   directGpuCommUsedWithGpuUpdate))
 {
 }
 
index 64ada7d8d8a681ca4794da9735a091ce03a2b9c6..98cd178a96ed670ae7163fa4ab18e9639d0aea66 100644 (file)
@@ -1298,6 +1298,45 @@ int Mdrunner::mdrunner()
                                                  systemHasConstraintsOrVsites(mtop),
                                                  cutoffMargin);
 
+    try
+    {
+        const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
+
+        useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
+                                                         updateGroups.useUpdateGroups(),
+                                                         pmeRunMode,
+                                                         domdecOptions.numPmeRanks > 0,
+                                                         useGpuForNonbonded,
+                                                         updateTarget,
+                                                         gpusWereDetected,
+                                                         *inputrec,
+                                                         mtop,
+                                                         doEssentialDynamics,
+                                                         gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
+                                                         haveFrozenAtoms,
+                                                         doRerun,
+                                                         devFlags,
+                                                         mdlog);
+    }
+    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
+
+    bool useGpuDirectHalo = false;
+
+    if (useGpuForNonbonded)
+    {
+        // cr->npmenodes is not yet initialized.
+        // domdecOptions.numPmeRanks == -1 results in 0 separate PME ranks when useGpuForNonbonded is true.
+        // TODO: remove this assumption once auto mode supports separate PME ranks.
+        const int numPmeRanks = domdecOptions.numPmeRanks > 0 ? domdecOptions.numPmeRanks : 0;
+        bool      havePPDomainDecomposition = (cr->sizeOfDefaultCommunicator - numPmeRanks) > 1;
+        useGpuDirectHalo                    = decideWhetherToUseGpuForHalo(devFlags,
+                                                        havePPDomainDecomposition,
+                                                        useGpuForNonbonded,
+                                                        useModularSimulator,
+                                                        doRerun,
+                                                        EI_ENERGY_MINIMIZATION(inputrec->eI));
+    }
+
     // This builder is necessary while we have multi-part construction
     // of DD. Before DD is constructed, we use the existence of
     // the builder object to indicate that further construction of DD
@@ -1305,7 +1344,10 @@ int Mdrunner::mdrunner()
     std::unique_ptr<DomainDecompositionBuilder> ddBuilder;
     if (useDomainDecomposition)
     {
-        ddBuilder = std::make_unique<DomainDecompositionBuilder>(
+        // P2P GPU comm + GPU update leads to case in which we enqueue async work for multiple
+        // timesteps. DLB needs to be disabled in that case
+        const bool directGpuCommUsedWithGpuUpdate = GMX_THREAD_MPI && useGpuDirectHalo && useGpuForUpdate;
+        ddBuilder                                 = std::make_unique<DomainDecompositionBuilder>(
                 mdlog,
                 cr,
                 domdecOptions,
@@ -1319,7 +1361,8 @@ int Mdrunner::mdrunner()
                 updateGroups.maxUpdateGroupRadius(),
                 positionsFromStatePointer(globalState.get()),
                 useGpuForNonbonded,
-                useGpuForPme);
+                useGpuForPme,
+                directGpuCommUsedWithGpuUpdate);
     }
     else
     {
@@ -1403,32 +1446,6 @@ int Mdrunner::mdrunner()
         }
     }
 
-    // The GPU update is decided here because we need to know whether the constraints or
-    // SETTLEs can span across the domain borders (i.e. whether or not update groups are
-    // defined). This is only known after DD is initialized, hence decision on using GPU
-    // update is done so late.
-    try
-    {
-        const bool haveFrozenAtoms = inputrecFrozenAtoms(inputrec.get());
-
-        useGpuForUpdate = decideWhetherToUseGpuForUpdate(useDomainDecomposition,
-                                                         updateGroups.useUpdateGroups(),
-                                                         pmeRunMode,
-                                                         domdecOptions.numPmeRanks > 0,
-                                                         useGpuForNonbonded,
-                                                         updateTarget,
-                                                         gpusWereDetected,
-                                                         *inputrec,
-                                                         mtop,
-                                                         doEssentialDynamics,
-                                                         gmx_mtop_ftype_count(mtop, F_ORIRES) > 0,
-                                                         haveFrozenAtoms,
-                                                         doRerun,
-                                                         devFlags,
-                                                         mdlog);
-    }
-    GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
-
     const bool printHostName = (cr->nnodes > 1);
     gpuTaskAssignments.reportGpuUsage(mdlog, printHostName, useGpuForBonded, pmeRunMode, useGpuForUpdate);
 
@@ -1445,13 +1462,6 @@ int Mdrunner::mdrunner()
 
     MdrunScheduleWorkload runScheduleWork;
 
-    bool useGpuDirectHalo = decideWhetherToUseGpuForHalo(devFlags,
-                                                         havePPDomainDecomposition(cr),
-                                                         useGpuForNonbonded,
-                                                         useModularSimulator,
-                                                         doRerun,
-                                                         EI_ENERGY_MINIMIZATION(inputrec->eI));
-
     // Also populates the simulation constant workload description.
     // Note: currently the default duty is DUTY_PP | DUTY_PME for all simulations, including those without PME,
     // so this boolean is sufficient on all ranks to determine whether separate PME ranks are used,