GPU codepath cycle counting fixes
authorSzilárd Páll <pall.szilard@gmail.com>
Tue, 10 Nov 2020 12:33:23 +0000 (12:33 +0000)
committerArtem Zhmurov <zhmurov@gmail.com>
Tue, 10 Nov 2020 12:33:23 +0000 (12:33 +0000)
* eliminated double-counting in bonded module initialization
* moved the bonded kernel launch counting into the gpuBoned module
* avoid counting ~0 CPU update work on the GPU path by conditionally
starting the ewcUPDATE counter
* also added some minimal improvements to the wallcycle debugging
tooling which should facilitate finding invalid nesting and missing
stop calls.

Refs #3764

src/gromacs/listed_forces/gpubondedkernels.cu
src/gromacs/mdlib/sim_util.cpp
src/gromacs/mdrun/md.cpp
src/gromacs/timing/wallcycle.cpp

index d253241ee73b6ef0b54fe3147c3b19f973c88ece..d8ad851446599129236ddbeb948c6175d661a8cd 100644 (file)
@@ -61,6 +61,7 @@
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/pbcutil/pbc_aiuc_cuda.cuh"
+#include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/gmxassert.h"
 
 #include "gpubonded_impl.h"
@@ -826,6 +827,9 @@ void GpuBonded::Impl::launchKernel()
     GMX_ASSERT(haveInteractions_,
                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
 
+    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED);
+
     int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
 
     if (fTypeRangeEnd < 0)
@@ -839,6 +843,9 @@ void GpuBonded::Impl::launchKernel()
 
     launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr,
                     "exec_kernel_gpu<calcVir, calcEner>", kernelArgs);
+
+    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
+    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
 }
 
 void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork)
index 18606f120fb1f1320d85542e64a27dabcef9f8f7..9ef783b67a189d8096b22af45f2761754abee40a 100644 (file)
@@ -1341,11 +1341,12 @@ void do_force(FILE*                               fplog,
         /* initialize the GPU nbnxm atom data and bonded data structures */
         if (simulationWork.useGpuNonbonded)
         {
+            // Note: cycle counting only nononbondeds, gpuBonded counts internally
             wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-
             wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
 
             if (fr->gpuBonded)
             {
@@ -1361,7 +1362,6 @@ void do_force(FILE*                               fplog,
                         nbv->getGridIndices(), top->idef, Nbnxm::gpu_get_xq(nbv->gpu_nbv),
                         Nbnxm::gpu_get_f(nbv->gpu_nbv), Nbnxm::gpu_get_fshift(nbv->gpu_nbv));
             }
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
         }
 
         // Need to run after the GPU-offload bonded interaction lists
@@ -1415,7 +1415,6 @@ void do_force(FILE*                               fplog,
         ddBalanceRegionHandler.openBeforeForceComputationGpu();
 
         wallcycle_start(wcycle, ewcLAUNCH_GPU);
-
         wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
         if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
@@ -1423,18 +1422,18 @@ void do_force(FILE*                               fplog,
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
         }
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
         // with X buffer ops offloaded to the GPU on all but the search steps
 
         // bonded work not split into separate local and non-local, so with DD
         // we can only launch the kernel after non-local coordinates have been received.
         if (domainWork.haveGpuBondedWork && !havePPDomainDecomposition(cr))
         {
-            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
             fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
         }
 
         /* launch local nonbonded work on GPU */
+        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
         wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
         do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
         wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
@@ -1517,28 +1516,27 @@ void do_force(FILE*                               fplog,
 
         if (simulationWork.useGpuNonbonded)
         {
-            wallcycle_start(wcycle, ewcLAUNCH_GPU);
 
             if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
             {
+                wallcycle_start(wcycle, ewcLAUNCH_GPU);
                 wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
                 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
                 wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+                wallcycle_stop(wcycle, ewcLAUNCH_GPU);
             }
 
             if (domainWork.haveGpuBondedWork)
             {
-                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_BONDED);
                 fr->gpuBonded->setPbcAndlaunchKernel(fr->pbcType, box, fr->bMolPBC, stepWork);
-                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_BONDED);
             }
 
             /* launch non-local nonbonded tasks on GPU */
+            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
             wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
             do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step,
                          nrnb, wcycle);
             wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-
             wallcycle_stop(wcycle, ewcLAUNCH_GPU);
         }
     }
@@ -1713,8 +1711,10 @@ void do_force(FILE*                               fplog,
     // TODO Force flags should include haveFreeEnergyWork for this domain
     if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
     {
+        wallcycle_stop(wcycle, ewcFORCE);
         /* Wait for non-local coordinate data to be copied from device */
         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
+        wallcycle_start_nocount(wcycle, ewcFORCE);
     }
 
     // Compute wall interactions, when present.
index 507e2d0d91d3e9d5f840b3ae37b0df9a869636c6..792ca305502983e3a55edb91dee3fede79a45126 100644 (file)
@@ -1206,7 +1206,10 @@ void gmx::LegacySimulator::do_md()
 
         dvdl_constr = 0;
 
-        wallcycle_start(wcycle, ewcUPDATE);
+        if (!useGpuForUpdate)
+        {
+            wallcycle_start(wcycle, ewcUPDATE);
+        }
         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
         if (bTrotter)
         {
@@ -1256,8 +1259,6 @@ void gmx::LegacySimulator::do_md()
 
         if (useGpuForUpdate)
         {
-            wallcycle_stop(wcycle, ewcUPDATE);
-
             if (bNS && (bFirstStep || DOMAINDECOMP(cr)))
             {
                 integrator->set(stateGpu->getCoordinates(), stateGpu->getVelocities(),
index f1b428bf05ccaffc7691d4a36204cef54f5c919c..a80254755c4a149b363745bb98029b60e4861860 100644 (file)
 
 static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
 
-/* DEBUG_WCYCLE adds consistency checking for the counters.
- * It checks if you stop a counter different from the last
+#ifndef DEBUG_WCYCLE
+/*! \brief Enables consistency checking for the counters.
+ *
+ * If the macro is set to 1, code checks if you stop a counter different from the last
  * one that was opened and if you do nest too deep.
  */
-/* #define DEBUG_WCYCLE */
-
-#ifdef DEBUG_WCYCLE
+#    define DEBUG_WCYCLE 0
+#endif
+//! Whether wallcycle debugging is enabled
+constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
+//! True if only the master rank should print debugging output
+constexpr bool gmx_unused onlyMasterDebugPrints = true;
+//! True if cycle counter nesting depth debuggin prints are enabled
+constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
+
+#if DEBUG_WCYCLE
 #    include "gromacs/utility/fatalerror.h"
 #endif
 
@@ -86,10 +95,11 @@ struct gmx_wallcycle
     gmx_bool  wc_barrier;
     wallcc_t* wcc_all;
     int       wc_depth;
-#ifdef DEBUG_WCYCLE
+#if DEBUG_WCYCLE
 #    define DEPTH_MAX 6
-    int counterlist[DEPTH_MAX];
-    int count_depth;
+    int  counterlist[DEPTH_MAX];
+    int  count_depth;
+    bool isMasterRank;
 #endif
     int          ewc_prev;
     gmx_cycles_t cycle_prev;
@@ -214,6 +224,7 @@ gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused*
     wc->ewc_prev         = -1;
     wc->reset_counters   = resetstep;
 
+
 #if GMX_MPI
     if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
     {
@@ -241,8 +252,9 @@ gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused*
         snew(wc->wcsc, ewcsNR);
     }
 
-#ifdef DEBUG_WCYCLE
-    wc->count_depth = 0;
+#if DEBUG_WCYCLE
+    wc->count_depth  = 0;
+    wc->isMasterRank = MASTER(cr);
 #endif
 
     return wc;
@@ -283,24 +295,32 @@ static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
 }
 
 
-#ifdef DEBUG_WCYCLE
+#if DEBUG_WCYCLE
 static void debug_start_check(gmx_wallcycle_t wc, int ewc)
 {
-    /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
-
     if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
     {
-        gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
+        gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
     }
     wc->counterlist[wc->count_depth] = ewc;
     wc->count_depth++;
+
+    if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
+    {
+        std::string indentStr(4 * wc->count_depth, ' ');
+        fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
+    }
 }
 
 static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
 {
-    wc->count_depth--;
+    if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
+    {
+        std::string indentStr(4 * wc->count_depth, ' ');
+        fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
+    }
 
-    /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
+    wc->count_depth--;
 
     if (wc->count_depth < 0)
     {
@@ -331,7 +351,7 @@ void wallcycle_start(gmx_wallcycle_t wc, int ewc)
     }
 #endif
 
-#ifdef DEBUG_WCYCLE
+#if DEBUG_WCYCLE
     debug_start_check(wc, ewc);
 #endif
 
@@ -387,7 +407,7 @@ double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
     }
 #endif
 
-#ifdef DEBUG_WCYCLE
+#if DEBUG_WCYCLE
     debug_stop_check(wc, ewc);
 #endif