Move buffer ops / PME F reduction flags into StepWorkload
authorSzilárd Páll <pall.szilard@gmail.com>
Wed, 9 Oct 2019 00:53:24 +0000 (02:53 +0200)
committerArtem Zhmurov <zhmurov@gmail.com>
Sun, 13 Oct 2019 11:10:20 +0000 (13:10 +0200)
Also moved the override conditions for when buffer ops cannot be
offloaded into the DevelopmentFeatureFlags data structure initialization,
which had to be shifted so this code can be passed the task assignment
decision on nonbonded offload.

Change-Id: Ib6850bcf306a70bbd9557cf2d5c2b1e39159e566

src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/runner.cpp
src/gromacs/mdtypes/simulation_workload.h
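
For orientation, here is a minimal, self-contained sketch of the per-step
flag derivation this change introduces, condensed from the
setupStepWorkload() hunk below. The stripped-down structs and the helper
name deriveStepFlags are illustrative stand-ins, not the actual GROMACS
types or API:

#include <cassert>

// Illustrative stand-ins for the GROMACS workload types touched here.
struct SimulationWorkload
{
    bool useGpuBufferOps = false; // per-run decision, see the runner.cpp hunk
    bool useGpuNonbonded = false;
    bool usePmeGpu       = false;
};

struct StepWorkload
{
    bool computeVirial       = false;
    bool computeEnergy       = false;
    bool useGpuXBufferOps    = false;
    bool useGpuFBufferOps    = false;
    bool useGpuPmeFReduction = false;
};

// Mirrors the flag logic added to setupStepWorkload() in sim_util.cpp.
StepWorkload deriveStepFlags(const SimulationWorkload &simulationWork,
                             const bool                rankHasPmeDuty,
                             const bool                computeVirial,
                             const bool                computeEnergy)
{
    StepWorkload flags;
    flags.computeVirial = computeVirial;
    flags.computeEnergy = computeEnergy;
    // Buffer ops can only be offloaded together with the nonbonded task.
    assert(!simulationWork.useGpuBufferOps || simulationWork.useGpuNonbonded);
    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
    // Virial (and, for now, energy) steps fall back to the CPU reduction path.
    flags.useGpuFBufferOps = simulationWork.useGpuBufferOps
                             && !(flags.computeVirial || flags.computeEnergy);
    // GPU PME force reduction additionally requires PME on this rank's GPU.
    flags.useGpuPmeFReduction = flags.useGpuFBufferOps
                                && (simulationWork.usePmeGpu && rankHasPmeDuty);
    return flags;
}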

index 8386e341393cb5b6050b380fd9836c92636bc4b9..32c30aab8f86de8c2221f5ab43f3e31ed3637b2e 100644 (file)
@@ -613,17 +613,15 @@ static int makePmeFlags(const StepWorkload &stepWork)
  * \param[in]  box                  The box matrix
  * \param[in]  stepWork             Step schedule flags
  * \param[in]  pmeFlags             PME flags
- * \param[in]  useGpuForceReduction True if GPU-based force reduction is active this step
  * \param[in]  wcycle               The wallcycle structure
  */
 static inline void launchPmeGpuSpread(gmx_pme_t          *pmedata,
                                       const matrix        box,
                                       const StepWorkload &stepWork,
                                       int                 pmeFlags,
-                                      bool                useGpuForceReduction,
                                       gmx_wallcycle_t     wcycle)
 {
-    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, useGpuForceReduction);
+    pme_gpu_prepare_computation(pmedata, stepWork.haveDynamicBox, box, wcycle, pmeFlags, stepWork.useGpuPmeFReduction);
     pme_gpu_launch_spread(pmedata, wcycle);
 }
 
@@ -795,11 +793,16 @@ setupDomainLifetimeWorkload(const t_inputrec       &inputrec,
  *
  * \param[in]      legacyFlags          Force bitmask flags used to construct the new flags
  * \param[in]      isNonbondedOn        Global override; if false, all nonbonded calculation is turned off.
+ * \param[in]      simulationWork       Simulation workload description.
+ * \param[in]      rankHasPmeDuty       If this rank computes PME.
+ *
  * \returns New StepWorkload description.
  */
 static StepWorkload
-setupStepWorkload(const int     legacyFlags,
-                  const bool    isNonbondedOn)
+setupStepWorkload(const int                 legacyFlags,
+                  const bool                isNonbondedOn,
+                  const SimulationWorkload &simulationWork,
+                  const bool                rankHasPmeDuty)
 {
     StepWorkload flags;
     flags.stateChanged           = ((legacyFlags & GMX_FORCE_STATECHANGED) != 0);
@@ -811,6 +814,17 @@ setupStepWorkload(const int     legacyFlags,
     flags.computeListedForces    = ((legacyFlags & GMX_FORCE_LISTED) != 0);
     flags.computeNonbondedForces = ((legacyFlags & GMX_FORCE_NONBONDED) != 0) && isNonbondedOn;
     flags.computeDhdl            = ((legacyFlags & GMX_FORCE_DHDL) != 0);
+
+    if (simulationWork.useGpuBufferOps)
+    {
+        GMX_ASSERT(simulationWork.useGpuNonbonded, "Can only offload buffer ops if nonbonded computation is also offloaded");
+    }
+    flags.useGpuXBufferOps = simulationWork.useGpuBufferOps;
+    // on virial steps (and, for now, on energy steps) the CPU reduction path is taken
+    // TODO: remove flags.computeEnergy, ref #3128
+    flags.useGpuFBufferOps    = simulationWork.useGpuBufferOps && !(flags.computeVirial || flags.computeEnergy);
+    flags.useGpuPmeFReduction = flags.useGpuFBufferOps && (simulationWork.usePmeGpu && rankHasPmeDuty);
+
     return flags;
 }
 
@@ -911,25 +925,22 @@ void do_force(FILE                                     *fplog,
         legacyFlags &= ~GMX_FORCE_NONBONDED;
     }
 
-    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded);
-    const StepWorkload       &stepWork = runScheduleWork->stepWork;
-
     const SimulationWorkload &simulationWork = runScheduleWork->simulationWork;
 
-    const bool                useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
-    const int                 pmeFlags            = makePmeFlags(stepWork);
+
+    runScheduleWork->stepWork = setupStepWorkload(legacyFlags, fr->bNonbonded,
+                                                  simulationWork, thisRankHasDuty(cr, DUTY_PME));
+    const StepWorkload &stepWork = runScheduleWork->stepWork;
+
+
+    const bool useGpuPmeOnThisRank = simulationWork.usePmeGpu && thisRankHasDuty(cr, DUTY_PME);
+    const int  pmeFlags            = makePmeFlags(stepWork);
 
     // Switches on whether to use GPU for position and force buffer operations
     // TODO consider all possible combinations of triggers, and how to combine optimally in each case.
-    const BufferOpsUseGpu useGpuXBufOps = (simulationWork.useGpuBufferOps &&
-                                           simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;;
+    const BufferOpsUseGpu useGpuXBufOps = stepWork.useGpuXBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
     // GPU Force buffer ops are disabled on virial steps, because the virial calc is not yet ported to GPU
-    const BufferOpsUseGpu useGpuFBufOps = ((simulationWork.useGpuBufferOps &&
-                                            simulationWork.useGpuNonbonded && (GMX_GPU == GMX_GPU_CUDA)) &&
-                                           !(stepWork.computeVirial || stepWork.computeEnergy)) ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
-    // TODO: move / add this flag to the internal PME GPU data structures
-    const bool useGpuPmeFReduction = (useGpuFBufOps == BufferOpsUseGpu::True) &&
-        useGpuPmeOnThisRank; // only supported if this rank is perfoming PME on the GPU
+    const BufferOpsUseGpu useGpuFBufOps = stepWork.useGpuFBufferOps ? BufferOpsUseGpu::True : BufferOpsUseGpu::False;
 
     /* At a search step we need to start the first balancing region
      * somewhere early inside the step after communication during domain
@@ -1019,7 +1030,7 @@ void do_force(FILE                                     *fplog,
 
     if (useGpuPmeOnThisRank)
     {
-        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, useGpuPmeFReduction, wcycle);
+        launchPmeGpuSpread(fr->pmedata, box, stepWork, pmeFlags, wcycle);
     }
 
     /* do gridding for pair search */
@@ -1533,7 +1544,7 @@ void do_force(FILE                                     *fplog,
     }
 
     const bool useGpuForcesHaloExchange = ddUsesGpuDirectCommunication && (useGpuFBufOps == BufferOpsUseGpu::True);
-    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !useGpuPmeFReduction;
+    const bool useCpuPmeFReduction      = thisRankHasDuty(cr, DUTY_PME) && !stepWork.useGpuPmeFReduction;
     // TODO: move this into DomainLifetimeWorkload, including the second part of the condition
     const bool haveCpuLocalForces     = (domainWork.haveSpecialForces || domainWork.haveCpuListedForceWork || useCpuPmeFReduction ||
                                          (fr->efep != efepNO));
@@ -1636,7 +1647,7 @@ void do_force(FILE                                     *fplog,
     {
         gmx::FixedCapacityVector<GpuEventSynchronizer*, 2> dependencyList;
 
-        if (useGpuPmeFReduction)
+        if (stepWork.useGpuPmeFReduction)
         {
             dependencyList.push_back(pme_gpu_get_f_ready_synchronizer(fr->pmedata));
         }
@@ -1679,7 +1690,7 @@ void do_force(FILE                                     *fplog,
                                               stateGpu->getForces(),
                                               pme_gpu_get_device_f(fr->pmedata),
                                               dependencyList,
-                                              useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
+                                              stepWork.useGpuPmeFReduction, haveLocalForceContribInCpuBuffer);
             // This function call synchronizes the local stream
             nbv->wait_for_gpu_force_reduction(Nbnxm::AtomLocality::Local);
             stateGpu->copyForcesFromGpu(forceWithShift, gmx::StatePropagatorDataGpu::AtomLocality::Local);
index fdc93ac07e2827483542a22c9371b5977d4f2c8d..75dc6d6fea58e620db30f7957ee0545f6eb4f895 100644 (file)
@@ -194,14 +194,16 @@ struct DevelopmentFeatureFlags
  * Note that some development feature overrides are already applied here:
  * the GPU communication flags are set to false in non-tMPI and non-CUDA builds.
  *
- * \param[in]  mdlog        Logger object.
- * \returns                 The object populated with development feature flags.
+ * \param[in]  mdlog                Logger object.
+ * \param[in]  useGpuForNonbonded   True if the nonbonded task is offloaded in this run.
+ * \returns                         The object populated with development feature flags.
  */
-static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog)
+static DevelopmentFeatureFlags manageDevelopmentFeatures(const gmx::MDLogger &mdlog,
+                                                         const bool           useGpuForNonbonded)
 {
     DevelopmentFeatureFlags devFlags;
 
-    devFlags.enableGpuBufferOps    = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr);
+    devFlags.enableGpuBufferOps    = (getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr) && (GMX_GPU == GMX_GPU_CUDA) && useGpuForNonbonded;
     devFlags.useGpuUpdateConstrain = (getenv("GMX_UPDATE_CONSTRAIN_GPU") != nullptr);
     devFlags.enableGpuHaloExchange = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
     devFlags.enableGpuPmePPComm    = (getenv("GMX_GPU_DD_COMMS") != nullptr && GMX_THREAD_MPI && (GMX_GPU == GMX_GPU_CUDA));
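
The net effect of the hunk above is that setting GMX_USE_GPU_BUFFER_OPS is
no longer sufficient on its own: the build must be a CUDA build and the
nonbonded task must actually be offloaded. A stand-alone sketch of that
gating, with the GMX_GPU build macro replaced by an illustrative constant
and the struct reduced to the one relevant field:

#include <cstdlib>

// Stand-in for the GMX_GPU == GMX_GPU_CUDA compile-time check.
constexpr bool c_isCudaBuild = true;

struct DevelopmentFeatureFlags // reduced to the field discussed here
{
    bool enableGpuBufferOps = false;
};

DevelopmentFeatureFlags makeDevFlags(const bool useGpuForNonbonded)
{
    DevelopmentFeatureFlags devFlags;
    // The env var only takes effect in CUDA builds with nonbonded offload;
    // this is the override condition moved into the initializer.
    devFlags.enableGpuBufferOps = (std::getenv("GMX_USE_GPU_BUFFER_OPS") != nullptr)
                                  && c_isCudaBuild && useGpuForNonbonded;
    return devFlags;
}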
@@ -683,9 +685,6 @@ int Mdrunner::mdrunner()
     gmx::LoggerOwner logOwner(buildLogger(fplog, isSimulationMasterRank));
     gmx::MDLogger    mdlog(logOwner.logger());
 
-    // report any development features that may be enabled by environment variables
-    const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog);
-
     // TODO The thread-MPI master rank makes a working
     // PhysicalNodeCommunicator here, but it gets rebuilt by all ranks
     // after the threads have been launched. This works because no use
@@ -845,6 +844,10 @@ int Mdrunner::mdrunner()
     }
     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
 
+    // Initialize development feature flags that are enabled by environment variables
+    // and report those features that are enabled.
+    const DevelopmentFeatureFlags devFlags = manageDevelopmentFeatures(mdlog, useGpuForNonbonded);
+
     // Build restraints.
     // TODO: hide restraint implementation details from Mdrunner.
     // There is nothing unique about restraints at this point as far as the
index e20bcc9e7cfedfb457ab646e025fe2f4973420d9..147199771cb7ed5d5ff396b7e90fb4c945a9d2c6 100644 (file)
@@ -77,6 +77,16 @@ class StepWorkload
         bool computeListedForces = false;
         //! Whether this step DHDL needs to be computed
         bool computeDhdl = false;
+        /*! \brief Whether coordinate buffer ops are done on the GPU this step
+         * \note This conceptually belongs to DomainLifetimeWorkload, but is
+         * kept here for now because the flag is needed before
+         * DomainLifetimeWorkload is built.
+         */
+        bool useGpuXBufferOps  = false;
+        //! Whether force buffer ops are done on the GPU this step
+        bool useGpuFBufferOps  = false;
+        //! Whether PME forces are reduced with other contributions on the GPU this step
+        bool useGpuPmeFReduction = false; // TODO: add this flag to the internal PME GPU data structures too
 };
 
 /*! \libinternal
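
With the flags precomputed in StepWorkload, the per-step switches in
do_force() reduce to plain reads of the structure; for example, the
translation to the legacy BufferOpsUseGpu enum seen in the sim_util.cpp
hunks collapses to a one-line conversion (sketch; the helper name is
illustrative):

enum class BufferOpsUseGpu { True, False };

// The former multi-condition ternaries in do_force() become simple reads.
BufferOpsUseGpu toBufferOpsUseGpu(const bool useGpuBufferOpsThisStep)
{
    return useGpuBufferOpsThisStep ? BufferOpsUseGpu::True
                                   : BufferOpsUseGpu::False;
}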