Move nonbonded GPU wait timing into the GPU module
authorSzilárd Páll <pall.szilard@gmail.com>
Mon, 29 Jul 2019 17:22:10 +0000 (19:22 +0200)
committerMark Abraham <mark.j.abraham@gmail.com>
Wed, 28 Aug 2019 04:35:19 +0000 (06:35 +0200)
Extended the wallcycle module to allow only incrementing a counter --
used to simplify the polling GPU wait that would otherwise require a
regular start/stop to accomplish the same.

Also eliminating some TODOs.

Change-Id: I058c7213ab03d3fb5b438db58f69180049771454

src/gromacs/mdlib/sim_util.cpp
src/gromacs/nbnxm/gpu_common.h
src/gromacs/nbnxm/nbnxm_gpu.h
src/gromacs/timing/wallcycle.cpp
src/gromacs/timing/wallcycle.h

index 6a0885f9a25784b40dcfb60c4da970582c522d88..3a87eefb0ab0b7018f87f3d0ac49f146a08139a5 100644 (file)
@@ -682,23 +682,15 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t                  *nbv
         if (!isNbGpuDone)
         {
             GpuTaskCompletion completionType = (isPmeGpuDone) ? GpuTaskCompletion::Wait : GpuTaskCompletion::Check;
-            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
             isNbGpuDone = Nbnxm::gpu_try_finish_task(nbv->gpu_nbv,
                                                      flags,
                                                      Nbnxm::AtomLocality::Local,
                                                      enerd->grpp.ener[egLJSR].data(),
                                                      enerd->grpp.ener[egCOULSR].data(),
-                                                     forceWithShiftForces.shiftForces(), completionType);
-            wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
-            // To get the call count right, when the task finished we
-            // issue a start/stop.
-            // TODO: move the ewcWAIT_GPU_NB_L cycle counting into nbnxn_gpu_try_finish_task()
-            // and ewcNB_XF_BUF_OPS counting into nbnxn_atomdata_add_nbat_f_to_f().
+                                                     forceWithShiftForces.shiftForces(), completionType, wcycle);
+
             if (isNbGpuDone)
             {
-                wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
-
                 nbv->atomdata_add_nbat_f_to_f(Nbnxm::AtomLocality::Local,
                                               forceWithShiftForces.force());
             }
@@ -1394,13 +1386,12 @@ void do_force(FILE                                     *fplog,
         {
             if (bUseGPU)
             {
-                wallcycle_start(wcycle, ewcWAIT_GPU_NB_NL);
-                Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                            flags, Nbnxm::AtomLocality::NonLocal,
-                                            enerd->grpp.ener[egLJSR].data(),
-                                            enerd->grpp.ener[egCOULSR].data(),
-                                            forceWithShiftForces.shiftForces());
-                cycles_wait_gpu += wallcycle_stop(wcycle, ewcWAIT_GPU_NB_NL);
+                cycles_wait_gpu += Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                                               flags, Nbnxm::AtomLocality::NonLocal,
+                                                               enerd->grpp.ener[egLJSR].data(),
+                                                               enerd->grpp.ener[egCOULSR].data(),
+                                                               forceWithShiftForces.shiftForces(),
+                                                               wcycle);
             }
             else
             {
@@ -1478,19 +1469,18 @@ void do_force(FILE                                     *fplog,
          * of the step time.
          */
         const float gpuWaitApiOverheadMargin = 2e6F; /* cycles */
-
-        wallcycle_start(wcycle, ewcWAIT_GPU_NB_L);
-        Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
-                                    flags, Nbnxm::AtomLocality::Local,
-                                    enerd->grpp.ener[egLJSR].data(),
-                                    enerd->grpp.ener[egCOULSR].data(),
-                                    forceOut.forceWithShiftForces().shiftForces());
-        float cycles_tmp = wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+        const float waitCycles               =
+            Nbnxm::gpu_wait_finish_task(nbv->gpu_nbv,
+                                        flags, Nbnxm::AtomLocality::Local,
+                                        enerd->grpp.ener[egLJSR].data(),
+                                        enerd->grpp.ener[egCOULSR].data(),
+                                        forceOut.forceWithShiftForces().shiftForces(),
+                                        wcycle);
 
         if (ddBalanceRegionHandler.useBalancingRegion())
         {
             DdBalanceRegionWaitedForGpu waitedForGpu = DdBalanceRegionWaitedForGpu::yes;
-            if (bDoForces && cycles_tmp <= gpuWaitApiOverheadMargin)
+            if (bDoForces && waitCycles <= gpuWaitApiOverheadMargin)
             {
                 /* We measured few cycles, it could be that the kernel
                  * and transfer finished earlier and there was no actual
index 151dc4b3ff6fac2ef608762e41d153fec89b8443..3d7871c96900d91702d832559d3153270ca1d151 100644 (file)
@@ -62,6 +62,7 @@
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
+#include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/stringutil.h"
 
@@ -371,7 +372,8 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
                          real                     *e_lj,
                          real                     *e_el,
                          gmx::ArrayRef<gmx::RVec>  shiftForces,
-                         GpuTaskCompletion         completionKind)
+                         GpuTaskCompletion         completionKind,
+                         gmx_wallcycle            *wcycle)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -386,12 +388,22 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
         // Query the state of the GPU stream and return early if we're not done
         if (completionKind == GpuTaskCompletion::Check)
         {
+            // To get the wcycle call count right, when in GpuTaskCompletion::Check mode,
+            // we start without counting and only when the task finished we issue a
+            // start/stop to increment.
+            // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
+            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+
             if (!haveStreamTasksCompleted(nb->stream[iLocality]))
             {
+                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+
                 // Early return to skip the steps below that we have to do only
                 // after the NB task completed
                 return false;
             }
+
+            wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
         }
         else
         {
@@ -431,17 +443,27 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          *nb,
  * \param[out] e_lj Pointer to the LJ energy output to accumulate into
  * \param[out] e_el Pointer to the electrostatics energy output to accumulate into
  * \param[out] shiftForces Shift forces buffer to accumulate into
+ * \param[out] wcycle Pointer to wallcycle data structure
+ * \return            The number of cycles the gpu wait took
  */
 //NOLINTNEXTLINE(misc-definitions-in-headers) TODO: move into source file
-void gpu_wait_finish_task(gmx_nbnxn_gpu_t          *nb,
-                          int                       flags,
-                          AtomLocality              aloc,
-                          real                     *e_lj,
-                          real                     *e_el,
-                          gmx::ArrayRef<gmx::RVec>  shiftForces)
+float gpu_wait_finish_task(gmx_nbnxn_gpu_t         *nb,
+                           int                      flags,
+                           AtomLocality             aloc,
+                           real                    *e_lj,
+                           real                    *e_el,
+                           gmx::ArrayRef<gmx::RVec> shiftForces,
+                           gmx_wallcycle           *wcycle)
 {
+    auto cycleCounter =
+        (gpuAtomToInteractionLocality(aloc) == InteractionLocality::Local) ? ewcWAIT_GPU_NB_L : ewcWAIT_GPU_NB_NL;
+
+    wallcycle_start(wcycle, cycleCounter);
     gpu_try_finish_task(nb, flags, aloc, e_lj, e_el, shiftForces,
-                        GpuTaskCompletion::Wait);
+                        GpuTaskCompletion::Wait, wcycle);
+    float waitTime = wallcycle_stop(wcycle, cycleCounter);
+
+    return waitTime;
 }
 
 } // namespace Nbnxm
index 93d608347b830c21a02cd0fd51266ceed14c5747..0fbd9e4690a9b68647fa66df0c6cf4b6e5a0727f 100644 (file)
@@ -53,6 +53,7 @@
 #include "locality.h"
 
 struct nbnxn_atomdata_t;
+struct gmx_wallcycle;
 enum class GpuTaskCompletion;
 
 namespace gmx
@@ -165,6 +166,10 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
  *  - auxiliary tasks: updating the internal module state (timing accumulation, list pruning states) and
  *  - internal staging reduction of (\p fshift, \p e_el, \p e_lj).
  *
+ * In GpuTaskCompletion::Check mode this function does the timing and keeps correct count
+ * for the nonbonded task (incrementing only once per taks), in the GpuTaskCompletion::Wait mode
+ * timing is expected to be done in the caller.
+ *
  *  TODO: improve the handling of outputs e.g. by ensuring that this function explcitly returns the
  *  force buffer (instead of that being passed only to nbnxn_gpu_launch_cpyback()) and by returning
  *  the energy and Fshift contributions for some external/centralized reduction.
@@ -176,16 +181,18 @@ void gpu_launch_cpyback(gmx_nbnxn_gpu_t  gmx_unused *nb,
  * \param[out] e_el   Pointer to the electrostatics energy output to accumulate into
  * \param[out] shiftForces    Shift forces buffer to accumulate into
  * \param[in]  completionKind Indicates whether nnbonded task completion should only be checked rather than waited for
+ * \param[out] wcycle Pointer to wallcycle data structure
  * \returns              True if the nonbonded tasks associated with \p aloc locality have completed
  */
 GPU_FUNC_QUALIFIER
-bool gpu_try_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
-                         int                      gmx_unused  flags,
-                         AtomLocality             gmx_unused  aloc,
-                         real                     gmx_unused *e_lj,
-                         real                     gmx_unused *e_el,
+bool gpu_try_finish_task(gmx_nbnxn_gpu_t gmx_unused  *nb,
+                         int             gmx_unused   flags,
+                         AtomLocality    gmx_unused   aloc,
+                         real            gmx_unused  *e_lj,
+                         real            gmx_unused  *e_el,
                          gmx::ArrayRef<gmx::RVec> gmx_unused  shiftForces,
-                         GpuTaskCompletion        gmx_unused  completionKind) GPU_FUNC_TERM_WITH_RETURN(false);
+                         GpuTaskCompletion gmx_unused completionKind,
+                         gmx_wallcycle    gmx_unused  *wcycle) GPU_FUNC_TERM_WITH_RETURN(false);
 
 /*! \brief  Completes the nonbonded GPU task blocking until GPU tasks and data
  * transfers to finish.
@@ -202,12 +209,13 @@ bool gpu_try_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
  * \param[out] shiftForces Shift forces buffer to accumulate into
  */
 GPU_FUNC_QUALIFIER
-void gpu_wait_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
-                          int                      gmx_unused  flags,
-                          AtomLocality             gmx_unused  aloc,
-                          real                     gmx_unused *e_lj,
-                          real                     gmx_unused *e_el,
-                          gmx::ArrayRef<gmx::RVec> gmx_unused  shiftForces) GPU_FUNC_TERM;
+float gpu_wait_finish_task(gmx_nbnxn_gpu_t          gmx_unused *nb,
+                           int             gmx_unused  flags,
+                           AtomLocality    gmx_unused  aloc,
+                           real            gmx_unused *e_lj,
+                           real            gmx_unused *e_el,
+                           gmx::ArrayRef<gmx::RVec> gmx_unused shiftForces,
+                           gmx_wallcycle    gmx_unused  *wcycle) GPU_FUNC_TERM_WITH_RETURN(0.0);
 
 /*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
 GPU_FUNC_QUALIFIER
index e558a4b4456b64a4e86e1ee0db9c63ac15fbaa6b..cf2320591ecc470ddb8f54e0a83adeb00b925106 100644 (file)
@@ -312,6 +312,15 @@ void wallcycle_start(gmx_wallcycle_t wc, int ewc)
     }
 }
 
+void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
+{
+    if (wc == nullptr)
+    {
+        return;
+    }
+    wc->wcc[ewc].n++;
+}
+
 void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
 {
     if (wc == nullptr)
index f2377ebe5ca787f0d0081d8fc415d8f4556b8298..ed1d6a59ec3aa9e340ef035f020cdf45aa4ce92f 100644 (file)
@@ -106,6 +106,9 @@ void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc);
 double wallcycle_stop(gmx_wallcycle_t wc, int ewc);
 /* Stop the cycle count for ewc, returns the last cycle count */
 
+void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc);
+/* Only increment call count for ewc by one */
+
 void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c);
 /* Returns the cumulative count and cycle count for ewc */