Move nonbonded GPU wait timing into the GPU module

author Szilárd Páll <pall.szilard@gmail.com>

Mon, 29 Jul 2019 17:22:10 +0000 (19:22 +0200)

committer Mark Abraham <mark.j.abraham@gmail.com>

Wed, 28 Aug 2019 04:35:19 +0000 (06:35 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Mon, 29 Jul 2019 17:22:10 +0000 (19:22 +0200)
committer Mark Abraham <mark.j.abraham@gmail.com>
Wed, 28 Aug 2019 04:35:19 +0000 (06:35 +0200)
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp

index 6a0885f9a25784b40dcfb60c4da970582c522d88..3a87eefb0ab0b7018f87f3d0ac49f146a08139a5 100644 (file)
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -682,23 +682,15 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
          if (!isNbGpuDone)
          {
              GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
-            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
              isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
                                                       flags,
                                                       Nbnxm::AtomLocality::Local,
                                                       enerd->grpp.ener[egLJSR].data(),
                                                       enerd->grpp.ener[egCOULSR].data(),
-                                                     forceWithShiftForces.shiftForces(), completionType);
-            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
-            // To get the call count right, when the task finished we
-            // issue a start/stop.
-            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
-            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
+                                                     forceWithShiftForces.shiftForces(), completionType, wcycle);
+
              if (isNbGpuDone)
              {
-                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
-
                  nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
                                                forceWithShiftForces.force());
              }
@@ -1394,13 +1386,12 @@ void do_force(FILE                                     *fplog,
          {
              if (bUseGPU)
              {
-                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
-                Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                            flags, Nbnxm::AtomLocality::NonLocal,
-                                            enerd->grpp.ener[egLJSR].data(),
-                                            enerd->grpp.ener[egCOULSR].data(),
-                                            forceWithShiftForces.shiftForces());
-                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                                               flags, Nbnxm::AtomLocality::NonLocal,
+                                                               enerd->grpp.ener[egLJSR].data(),
+                                                               enerd->grpp.ener[egCOULSR].data(),
+                                                               forceWithShiftForces.shiftForces(),
+                                                               wcycle);
              }
              else
              {
@@ -1478,19 +1469,18 @@ void do_force(FILE                                     *fplog,
           * of the step time.
           */
          const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
-
-        wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-        Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                    flags, Nbnxm::AtomLocality::Local,
-                                    enerd->grpp.ener[egLJSR].data(),
-                                    enerd->grpp.ener[egCOULSR].data(),
-                                    forceOut.forceWithShiftForces().shiftForces());
-        float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+        const float waitCycles               =
+            Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                        flags, Nbnxm::AtomLocality::Local,
+                                        enerd->grpp.ener[egLJSR].data(),
+                                        enerd->grpp.ener[egCOULSR].data(),
+                                        forceOut.forceWithShiftForces().shiftForces(),
+                                        wcycle);
  
          if (ddBalanceRegionHandler.useBalancingRegion())
          {
              DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
-            if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
+            if (bDoForces && waitCycles <= gpuWaitApiOverheadMargin)
              {
                  /* We measured few cycles, it could be that the kernel
                   * and transfer finished earlier and there was no actual
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h

index 151dc4b3ff6fac2ef608762e41d153fec89b8443..3d7871c96900d91702d832559d3153270ca1d151 100644 (file)
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -62,6 +62,7 @@
  #include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/pbcutil/ishift.h"
  #include "gromacs/timing/gpu_timing.h"
+#include "gromacs/timing/wallcycle.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/stringutil.h"
  
@@ -371,7 +372,8 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
                           real                     *e_lj,
                           real                     *e_el,
                           gmx::ArrayRef<gmx::RVec>  shiftForces,
-                         GpuTaskCompletion         completionKind)
+                         GpuTaskCompletion         completionKind,
+                         gmx_wallcycle            *wcycle)
  {
      GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
  
@@ -386,12 +388,22 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
          // Query the state of the GPU stream and return early if we're not done
          if (completionKind == GpuTaskCompletion::Check)
          {
+            // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
+            // we start without counting and only when the task finished we issue a
+            // start/stop to increment.
+            // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
+            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+
              if (!haveStreamTasksCompleted(nb->stream[iLocality]))
              {
+                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
                  // Early return to skip the steps below that we have to do only
                  // after the NB task completed
                  return false;
              }
+
+            wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
          }
          else
          {
@@ -431,17 +443,27 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
   * \param[out] e_lj Pointer to the LJ energy output to accumulate into
   * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
   * \param[out] shiftForces Shift forces buffer to accumulate into
+ * \param[out] wcycle Pointer to wallcycle data structure
+ * \return            The number of cycles the gpu wait took
   */
  //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
-void gpu_wait_finish_task(gmx_nbnxn_gpu_t          *nb,
-                          int                       flags,
-                          AtomLocality              aloc,
-                          real                     *e_lj,
-                          real                     *e_el,
-                          gmx::ArrayRef<gmx::RVec>  shiftForces)
+float gpu_wait_finish_task(gmx_nbnxn_gpu_t         *nb,
+                           int                      flags,
+                           AtomLocality             aloc,
+                           real                    *e_lj,
+                           real                    *e_el,
+                           gmx::ArrayRef<gmx::RVec> shiftForces,
+                           gmx_wallcycle           *wcycle)
  {
+    auto cycleCounter =
+        (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local) ? ewcWAIT_GPU_NB_L : ewcWAIT_GPU_NB_NL;
+
+    wallcycle_start(wcycle, cycleCounter);
      gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, shiftForces,
-                        GpuTaskCompletion::Wait);
+                        GpuTaskCompletion::Wait, wcycle);
+    float waitTime = wallcycle_stop(wcycle, cycleCounter);
+
+    return waitTime;
  }
  
  } // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/nbnxm_gpu.h b/src/gromacs/nbnxm/nbnxm_gpu.h

index 93d608347b830c21a02cd0fd51266ceed14c5747..0fbd9e4690a9b68647fa66df0c6cf4b6e5a0727f 100644 (file)
--- a/src/gromacs/nbnxm/nbnxm_gpu.h
+++ b/src/gromacs/nbnxm/nbnxm_gpu.h
@@ -53,6 +53,7 @@
  #include "locality.h"
  
  struct nbnxn_atomdata_t;
+struct gmx_wallcycle;
  enum class GpuTaskCompletion;
  
  namespace gmx
@@ -165,6 +166,10 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
   *  - auxiliary tasks: updating the internal module state (timing accumulation, list pruning states) and
   *  - internal staging reduction of (\p fshift, \p e_el, \p e_lj).
   *
+ * In GpuTaskCompletion::Check mode this function does the timing and keeps correct count
+ * for the nonbonded task (incrementing only once per taks), in the GpuTaskCompletion::Wait mode
+ * timing is expected to be done in the caller.
+ *
   *  TODO: improve the handling of outputs e.g. by ensuring that this function explcitly returns the
   *  force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning
   *  the energy and Fshift contributions for some external/centralized reduction.
@@ -176,16 +181,18 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
   * \param[out] e_el   Pointer to the electrostatics energy output to accumulate into
   * \param[out] shiftForces    Shift forces buffer to accumulate into
   * \param[in]  completionKind Indicates whether nnbonded task completion should only be checked rather than waited for
+ * \param[out] wcycle Pointer to wallcycle data structure
   * \returns              True if the nonbonded tasks associated with \p aloc locality have completed
   */
  GPU_FUNC_QUALIFIER
-bool gpu_try_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
-                         int                      gmx_unused  flags,
-                         AtomLocality             gmx_unused  aloc,
-                         real                     gmx_unused *e_lj,
-                         real                     gmx_unused *e_el,
+bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
+                         int             gmx_unused   flags,
+                         AtomLocality    gmx_unused   aloc,
+                         real            gmx_unused  *e_lj,
+                         real            gmx_unused  *e_el,
                           gmx::ArrayRef<gmx::RVec> gmx_unused  shiftForces,
-                         GpuTaskCompletion        gmx_unused  completionKind) GPU_FUNC_TERM_WITH_RETURN(false);
+                         GpuTaskCompletion gmx_unused completionKind,
+                         gmx_wallcycle    gmx_unused  *wcycle) GPU_FUNC_TERM_WITH_RETURN(false);
  
  /*! \brief  Completes the nonbonded GPU task blocking until GPU tasks and data
   * transfers to finish.
@@ -202,12 +209,13 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
   * \param[out] shiftForces Shift forces buffer to accumulate into
   */
  GPU_FUNC_QUALIFIER
-void gpu_wait_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
-                          int                      gmx_unused  flags,
-                          AtomLocality             gmx_unused  aloc,
-                          real                     gmx_unused *e_lj,
-                          real                     gmx_unused *e_el,
-                          gmx::ArrayRef<gmx::RVec> gmx_unused  shiftForces) GPU_FUNC_TERM;
+float gpu_wait_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
+                           int             gmx_unused  flags,
+                           AtomLocality    gmx_unused  aloc,
+                           real            gmx_unused *e_lj,
+                           real            gmx_unused *e_el,
+                           gmx::ArrayRef<gmx::RVec> gmx_unused shiftForces,
+                           gmx_wallcycle    gmx_unused  *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0);
  
  /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
  GPU_FUNC_QUALIFIER
diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp

index e558a4b4456b64a4e86e1ee0db9c63ac15fbaa6b..cf2320591ecc470ddb8f54e0a83adeb00b925106 100644 (file)
--- a/src/gromacs/timing/wallcycle.cpp
+++ b/src/gromacs/timing/wallcycle.cpp
@@ -312,6 +312,15 @@ void wallcycle_start(gmx_wallcycle_t wc, int ewc)
      }
  }
  
+void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
+{
+    if (wc == nullptr)
+    {
+        return;
+    }
+    wc->wcc[ewc].n++;
+}
+
  void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
  {
      if (wc == nullptr)
diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h

index f2377ebe5ca787f0d0081d8fc415d8f4556b8298..ed1d6a59ec3aa9e340ef035f020cdf45aa4ce92f 100644 (file)
--- a/src/gromacs/timing/wallcycle.h
+++ b/src/gromacs/timing/wallcycle.h
@@ -106,6 +106,9 @@ void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc);
  double wallcycle_stop(gmx_wallcycle_t wc, int ewc);
  /* Stop the cycle count for ewc, returns the last cycle count */
  
+void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc);
+/* Only increment call count for ewc by one */
+
  void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c);
  /* Returns the cumulative count and cycle count for ewc */
author	Szilárd Páll <pall.szilard@gmail.com>
	Mon, 29 Jul 2019 17:22:10 +0000 (19:22 +0200)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Wed, 28 Aug 2019 04:35:19 +0000 (06:35 +0200)
src/gromacs/mdlib/sim_util.cpp		patch \| blob \| history
src/gromacs/nbnxm/gpu_common.h		patch \| blob \| history
src/gromacs/nbnxm/nbnxm_gpu.h		patch \| blob \| history
src/gromacs/timing/wallcycle.cpp		patch \| blob \| history
src/gromacs/timing/wallcycle.h		patch \| blob \| history