Remove/replace many mentions of Jenkins
[alexxy/gromacs.git] / src / gromacs / timing / wallcycle.cpp
index 6195dadd27e32cb5ac49df70153e4c76c7b1b6a1..1f1d7ca38ea2eec6b6ac4f68f2505d530ed0e163 100644 (file)
@@ -3,7 +3,8 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2008, The GROMACS development team.
- * Copyright (c) 2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 #include <cstdlib>
 
 #include <array>
+#include <memory>
 #include <vector>
 
+#include "gromacs/math/functions.h"
 #include "gromacs/mdtypes/commrec.h"
 #include "gromacs/timing/cyclecounter.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/timing/wallcyclereporting.h"
+#include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/enumerationhelpers.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/gmxmpi.h"
 #include "gromacs/utility/logger.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/snprintf.h"
+#include "gromacs/utility/stringutil.h"
 
-static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
+//! Whether wallcycle debugging is enabled
+constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
+//! True if only the master rank should print debugging output
+constexpr bool gmx_unused onlyMasterDebugPrints = true;
+//! True if cycle counter nesting depth debugging prints are enabled
+constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
 
-/* DEBUG_WCYCLE adds consistency checking for the counters.
- * It checks if you stop a counter different from the last
- * one that was opened and if you do nest too deep.
- */
-/* #define DEBUG_WCYCLE */
-
-#ifdef DEBUG_WCYCLE
-#include "gromacs/utility/fatalerror.h"
-#endif
-
-typedef struct
-{
-    int          n;
-    gmx_cycles_t c;
-    gmx_cycles_t start;
-} wallcc_t;
-
-typedef struct gmx_wallcycle
-{
-    wallcc_t        *wcc;
-    /* did we detect one or more invalid cycle counts */
-    gmx_bool         haveInvalidCount;
-    /* variables for testing/debugging */
-    gmx_bool         wc_barrier;
-    wallcc_t        *wcc_all;
-    int              wc_depth;
-#ifdef DEBUG_WCYCLE
-#define DEPTH_MAX 6
-    int               counterlist[DEPTH_MAX];
-    int               count_depth;
-#endif
-    int               ewc_prev;
-    gmx_cycles_t      cycle_prev;
-    gmx_int64_t       reset_counters;
-#if GMX_MPI
-    MPI_Comm          mpi_comm_mygroup;
+#if DEBUG_WCYCLE
+#    include "gromacs/utility/fatalerror.h"
 #endif
-    wallcc_t         *wcsc;
-} gmx_wallcycle_t_t;
 
 /* Each name should not exceed 19 printing characters
    (ie. terminating null can be twentieth) */
-static const char *wcn[ewcNR] =
+static const char* enumValuetoString(WallCycleCounter enumValue)
 {
-    "Run", "Step", "PP during PME", "Domain decomp.", "DD comm. load",
-    "DD comm. bounds", "Vsite constr.", "Send X to PME", "Neighbor search", "Launch GPU ops.",
-    "Comm. coord.", "Born radii", "Force", "Wait + Comm. F", "PME mesh",
-    "PME redist. X/F", "PME spread", "PME gather", "PME 3D-FFT", "PME 3D-FFT Comm.", "PME solve LJ", "PME solve Elec",
-    "PME wait for PP", "Wait + Recv. PME F",
-    "Wait PME GPU spread", "PME 3D-FFT", "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
-    "Wait PME GPU gather", "Reduce GPU PME F",
-    "Wait GPU NB nonloc.", "Wait GPU NB local", "NB X/F buffer ops.",
-    "Vsite spread", "COM pull force", "AWH",
-    "Write traj.", "Update", "Constraints", "Comm. energies",
-    "Enforced rotation", "Add rot. forces", "Position swapping", "IMD", "Test"
-};
+    constexpr gmx::EnumerationArray<WallCycleCounter, const char*> wallCycleCounterNames = {
+        "Run",
+        "Step",
+        "PP during PME",
+        "Domain decomp.",
+        "DD comm. load",
+        "DD comm. bounds",
+        "Vsite constr.",
+        "Send X to PME",
+        "Neighbor search",
+        "Launch GPU ops.",
+        "Comm. coord.",
+        "Force",
+        "Wait + Comm. F",
+        "PME mesh",
+        "PME redist. X/F",
+        "PME spread",
+        "PME gather",
+        "PME 3D-FFT",
+        "PME 3D-FFT Comm.",
+        "PME solve LJ",
+        "PME solve Elec",
+        "PME wait for PP",
+        "Wait + Recv. PME F",
+        "Wait PME GPU spread",
+        "PME 3D-FFT",
+        "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
+        "Wait PME GPU gather",
+        "Wait Bonded GPU",
+        "Reduce GPU PME F",
+        "Wait GPU NB nonloc.",
+        "Wait GPU NB local",
+        "Wait GPU state copy",
+        "NB X/F buffer ops.",
+        "Vsite spread",
+        "COM pull force",
+        "AWH",
+        "Write traj.",
+        "Update",
+        "Constraints",
+        "Comm. energies",
+        "Enforced rotation",
+        "Add rot. forces",
+        "Position swapping",
+        "IMD",
+        "Test"
+    };
+    return wallCycleCounterNames[enumValue];
+}
 
-static const char *wcsn[ewcsNR] =
+static const char* enumValuetoString(WallCycleSubCounter enumValue)
 {
-    "DD redist.", "DD NS grid + sort", "DD setup comm.",
-    "DD make top.", "DD make constr.", "DD top. other",
-    "NS grid local", "NS grid non-loc.", "NS search local", "NS search non-loc.",
-    "Bonded F",
-    "Bonded-FEP F",
-    "Restraints F",
-    "Listed buffer ops.",
-    "Nonbonded pruning",
-    "Nonbonded F",
-    "Launch NB GPU tasks",
-    "Launch PME GPU tasks",
-    "Ewald F correction",
-    "NB X buffer ops.",
-    "NB F buffer ops.",
-};
+    constexpr gmx::EnumerationArray<WallCycleSubCounter, const char*> wallCycleSubCounterNames = {
+        "DD redist.",
+        "DD NS grid + sort",
+        "DD setup comm.",
+        "DD make top.",
+        "DD make constr.",
+        "DD top. other",
+        "DD GPU ops.",
+        "NS grid local",
+        "NS grid non-loc.",
+        "NS search local",
+        "NS search non-loc.",
+        "Bonded F",
+        "Bonded-FEP F",
+        "Restraints F",
+        "Listed buffer ops.",
+        "Nonbonded pruning",
+        "Nonbonded F kernel",
+        "Nonbonded F clear",
+        "Nonbonded FEP",
+        "Nonbonded FEP reduction",
+        "Launch NB GPU tasks",
+        "Launch Bonded GPU tasks",
+        "Launch PME GPU tasks",
+        "Launch state copy",
+        "Ewald F correction",
+        "NB X buffer ops.",
+        "NB F buffer ops.",
+        "Clear force buffer",
+        "Launch GPU NB X buffer ops.",
+        "Launch GPU NB F buffer ops.",
+        "Launch GPU Comm. coord.",
+        "Launch GPU Comm. force.",
+        "Launch GPU update",
+        "Test subcounter"
+    };
+    return wallCycleSubCounterNames[enumValue];
+}
 
 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
-static const char *PMEStageNames[] =
+static const char* enumValuetoString(PmeStage enumValue)
 {
-    "PME spline",
-    "PME spread",
-    "PME spline + spread",
-    "PME 3D-FFT r2c",
-    "PME solve",
-    "PME 3D-FFT c2r",
-    "PME gather",
+    constexpr gmx::EnumerationArray<PmeStage, const char*> pmeStageNames = {
+        "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
+        "PME solve",  "PME 3D-FFT c2r", "PME gather"
+    };
+    return pmeStageNames[enumValue];
 };
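
The three lookup functions above replace the old raw const char* name arrays with gmx::EnumerationArray tables indexed directly by the enum, so a counter name can only be looked up with the matching key type. A minimal standalone sketch of the same pattern, using hypothetical names rather than the GROMACS types:

    #include <array>
    #include <cstddef>
    #include <cstdio>

    // Stand-in for an enum class with a trailing Count enumerator, like
    // WallCycleCounter, WallCycleSubCounter and PmeStage above.
    enum class Stage : int
    {
        Spread,
        Gather,
        Solve,
        Count
    };

    static const char* stageName(Stage s)
    {
        // Stand-in for gmx::EnumerationArray<Stage, const char*>: a plain array
        // sized by Stage::Count and indexed by casting the enum key.
        static constexpr std::array<const char*, static_cast<std::size_t>(Stage::Count)> names = {
            "Spread", "Gather", "Solve"
        };
        return names[static_cast<std::size_t>(s)];
    }

    int main()
    {
        std::printf("%s\n", stageName(Stage::Gather)); // prints "Gather"
        return 0;
    }

Compared with indexing a raw array by int, passing the wrong kind of counter to the wrong table becomes a type error rather than a silent off-by-one.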
 
-gmx_bool wallcycle_have_counter(void)
+bool wallcycle_have_counter()
 {
     return gmx_cycles_have_counter();
 }
 
-gmx_wallcycle_t wallcycle_init(FILE *fplog, int resetstep, t_commrec gmx_unused *cr)
+std::unique_ptr<gmx_wallcycle> wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
 {
-    gmx_wallcycle_t wc;
+    std::unique_ptr<gmx_wallcycle> wc;
 
 
     if (!wallcycle_have_counter())
     {
-        return nullptr;
+        return wc;
     }
 
-    snew(wc, 1);
+    wc = std::make_unique<gmx_wallcycle>();
+
+    wc->haveInvalidCount = false;
+    wc->wc_barrier       = false;
+    wc->wc_depth         = 0;
+    wc->ewc_prev         = WallCycleCounter::Count;
+    wc->reset_counters   = resetstep;
+    wc->cr               = cr;
 
-    wc->haveInvalidCount    = FALSE;
-    wc->wc_barrier          = FALSE;
-    wc->wcc_all             = nullptr;
-    wc->wc_depth            = 0;
-    wc->ewc_prev            = -1;
-    wc->reset_counters      = resetstep;
 
 #if GMX_MPI
-    if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
+    if (cr != nullptr && PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
     {
         if (fplog)
         {
             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
         }
-        wc->wc_barrier       = TRUE;
-        wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
+        wc->wc_barrier = true;
     }
 #endif
 
-    snew(wc->wcc, ewcNR);
     if (getenv("GMX_CYCLE_ALL") != nullptr)
     {
         if (fplog)
         {
             fprintf(fplog, "\nWill time all the code during the run\n\n");
         }
-        snew(wc->wcc_all, ewcNR*ewcNR);
+        wc->wcc_all.resize(sc_numWallCycleCountersSquared);
     }
 
-    if (useCycleSubcounters)
-    {
-        snew(wc->wcsc, ewcsNR);
-    }
-
-#ifdef DEBUG_WCYCLE
-    wc->count_depth = 0;
+#if DEBUG_WCYCLE
+    wc->count_depth  = 0;
+    wc->isMasterRank = MASTER(cr);
 #endif
 
     return wc;
 }
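
wallcycle_init above keys two optional behaviours off environment variables: GMX_CYCLE_ALL allocates the wcc_all matrix so every counter pair is timed, and GMX_CYCLE_BARRIER makes each start/stop call in an MPI run wait on MPI_Barrier first. A minimal standalone sketch of that getenv-driven switching (hypothetical program, not GROMACS code):

    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        // The same switches wallcycle_init reads; setting the variable to any value enables them.
        const bool timeAllPairs  = (std::getenv("GMX_CYCLE_ALL") != nullptr);
        const bool barrierAround = (std::getenv("GMX_CYCLE_BARRIER") != nullptr);

        std::printf("time all counter pairs:        %s\n", timeAllPairs ? "yes" : "no");
        std::printf("MPI_Barrier around start/stop: %s\n", barrierAround ? "yes" : "no");
        return 0;
    }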
 
-/* TODO: Should be called from finish_run() or runner()
-   void wallcycle_destroy(gmx_wallcycle_t wc)
-   {
-    if (wc == nullptr)
+#if DEBUG_WCYCLE
+static void debug_start_check(gmx_wallcycle* wc, WallCycleCounter ewc)
+{
+    if (wc->count_depth < 0 || wc->count_depth >= c_MaxWallCycleDepth)
     {
-        return;
+        gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
     }
+    wc->counterlist[wc->count_depth] = ewc;
+    wc->count_depth++;
 
-    if (wc->wcc != nullptr)
+    if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
     {
-        sfree(wc->wcc);
+        std::string indentStr(4 * wc->count_depth, ' ');
+        fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
     }
-    if (wc->wcc_all != nullptr)
-    {
-        sfree(wc->wcc_all);
-    }
-    if (wc->wcsc != nullptr)
-    {
-        sfree(wc->wcsc);
-    }
-    sfree(wc);
-   }
- */
-
-static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
-{
-    wc->ewc_prev   = ewc;
-    wc->cycle_prev = cycle;
 }
 
-static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
+static void debug_stop_check(gmx_wallcycle* wc, WallCycleCounter ewc)
 {
-    wc->wcc_all[wc->ewc_prev*ewcNR+ewc].n += 1;
-    wc->wcc_all[wc->ewc_prev*ewcNR+ewc].c += cycle - wc->cycle_prev;
-}
-
-
-#ifdef DEBUG_WCYCLE
-static void debug_start_check(gmx_wallcycle_t wc, int ewc)
-{
-    /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
-
-    if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
+    if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
     {
-        gmx_fatal(FARGS, "wallcycle counter depth out of range: %d",
-                  wc->count_depth);
+        std::string indentStr(4 * wc->count_depth, ' ');
+        fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
     }
-    wc->counterlist[wc->count_depth] = ewc;
-    wc->count_depth++;
-}
 
-static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
-{
     wc->count_depth--;
 
-    /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
-
     if (wc->count_depth < 0)
     {
-        gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
+        gmx_fatal(FARGS,
+                  "wallcycle counter depth out of range when stopping %s: %d",
+                  enumValuetoString(ewc),
+                  wc->count_depth);
     }
     if (wc->counterlist[wc->count_depth] != ewc)
     {
-        gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
-                  wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
+        gmx_fatal(FARGS,
+                  "wallcycle mismatch at stop, start %s, stop %s",
+                  enumValuetoString(wc->counterlist[wc->count_depth]),
+                  enumValuetoString(ewc));
     }
 }
 #endif
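
When DEBUG_WCYCLE is set, the two helpers above check that counters are stopped in the reverse order they were started and that the nesting depth stays within bounds. A minimal standalone sketch of that LIFO consistency check, with hypothetical names and assert() standing in for gmx_fatal():

    #include <cassert>
    #include <vector>

    struct NestingChecker
    {
        std::vector<int> openCounters; // ids of counters started but not yet stopped

        void onStart(int counterId) { openCounters.push_back(counterId); }

        void onStop(int counterId)
        {
            assert(!openCounters.empty() && "stop without a matching start");
            assert(openCounters.back() == counterId && "stopped a different counter than the last one started");
            openCounters.pop_back();
        }
    };

    int main()
    {
        NestingChecker check;
        check.onStart(0); // e.g. Run
        check.onStart(1); // e.g. Force, nested inside Run
        check.onStop(1);
        check.onStop(0);  // balanced, so no assertion fires
        return 0;
    }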
 
-void wallcycle_start(gmx_wallcycle_t wc, int ewc)
+void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c)
 {
-    gmx_cycles_t cycle;
-
-    if (wc == nullptr)
-    {
-        return;
-    }
-
-#if GMX_MPI
-    if (wc->wc_barrier)
-    {
-        MPI_Barrier(wc->mpi_comm_mygroup);
-    }
-#endif
-
-#ifdef DEBUG_WCYCLE
-    debug_start_check(wc, ewc);
-#endif
-
-    cycle              = gmx_cycles_read();
-    wc->wcc[ewc].start = cycle;
-    if (wc->wcc_all != nullptr)
-    {
-        wc->wc_depth++;
-        if (ewc == ewcRUN)
-        {
-            wallcycle_all_start(wc, ewc, cycle);
-        }
-        else if (wc->wc_depth == 3)
-        {
-            wallcycle_all_stop(wc, ewc, cycle);
-        }
-    }
+    *n = wc->wcc[ewc].n;
+    *c = static_cast<double>(wc->wcc[ewc].c);
 }
 
-void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
+void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c)
 {
-    if (wc == nullptr)
+    if (sc_useCycleSubcounters && wc != nullptr)
     {
-        return;
+        *n = wc->wcsc[ewcs].n;
+        *c = static_cast<double>(wc->wcsc[ewcs].c);
     }
-
-    wallcycle_start(wc, ewc);
-    wc->wcc[ewc].n--;
 }
 
-double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
+void wallcycle_reset_all(gmx_wallcycle* wc)
 {
-    gmx_cycles_t cycle, last;
-
-    if (wc == nullptr)
-    {
-        return 0;
-    }
-
-#if GMX_MPI
-    if (wc->wc_barrier)
-    {
-        MPI_Barrier(wc->mpi_comm_mygroup);
-    }
-#endif
-
-#ifdef DEBUG_WCYCLE
-    debug_stop_check(wc, ewc);
-#endif
-
-    /* When processes or threads migrate between cores, the cycle counting
-     * can get messed up if the cycle counter on different cores are not
-     * synchronized. When this happens we expect both large negative and
-     * positive cycle differences. We can detect negative cycle differences.
-     * Detecting too large positive counts if difficult, since count can be
-     * large, especially for ewcRUN. If we detect a negative count,
-     * we will not print the cycle accounting table.
-     */
-    cycle                    = gmx_cycles_read();
-    if (cycle >= wc->wcc[ewc].start)
-    {
-        last                 = cycle - wc->wcc[ewc].start;
-    }
-    else
-    {
-        last                 = 0;
-        wc->haveInvalidCount = TRUE;
-    }
-    wc->wcc[ewc].c          += last;
-    wc->wcc[ewc].n++;
-    if (wc->wcc_all)
-    {
-        wc->wc_depth--;
-        if (ewc == ewcRUN)
-        {
-            wallcycle_all_stop(wc, ewc, cycle);
-        }
-        else if (wc->wc_depth == 2)
-        {
-            wallcycle_all_start(wc, ewc, cycle);
-        }
-    }
-
-    return last;
-}
-
-void wallcycle_get(gmx_wallcycle_t wc, int ewc, int *n, double *c)
-{
-    *n = wc->wcc[ewc].n;
-    *c = static_cast<double>(wc->wcc[ewc].c);
-}
-
-void wallcycle_reset_all(gmx_wallcycle_t wc)
-{
-    int i;
-
     if (wc == nullptr)
     {
         return;
     }
 
-    for (i = 0; i < ewcNR; i++)
+    for (auto& counter : wc->wcc)
     {
-        wc->wcc[i].n = 0;
-        wc->wcc[i].c = 0;
+        counter.n = 0;
+        counter.c = 0;
     }
-    wc->haveInvalidCount = FALSE;
+    wc->haveInvalidCount = false;
 
-    if (wc->wcc_all)
+    if (!wc->wcc_all.empty())
     {
-        for (i = 0; i < ewcNR*ewcNR; i++)
+        for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
         {
             wc->wcc_all[i].n = 0;
             wc->wcc_all[i].c = 0;
         }
     }
-    if (wc->wcsc)
+    for (auto& counter : wc->wcsc)
     {
-        for (i = 0; i < ewcsNR; i++)
-        {
-            wc->wcsc[i].n = 0;
-            wc->wcsc[i].c = 0;
-        }
+        counter.n = 0;
+        counter.c = 0;
     }
 }
 
-static gmx_bool is_pme_counter(int ewc)
+static bool is_pme_counter(WallCycleCounter ewc)
 {
-    return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
+    return (ewc >= WallCycleCounter::PmeMesh && ewc <= WallCycleCounter::PmeWaitComm);
 }
 
-static gmx_bool is_pme_subcounter(int ewc)
+static bool is_pme_subcounter(WallCycleCounter ewc)
 {
-    return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
+    return (ewc >= WallCycleCounter::PmeRedistXF && ewc < WallCycleCounter::PmeWaitComm);
+}
+
+void wallcycleBarrier(gmx_wallcycle* wc)
+{
+#if GMX_MPI
+    if (wc->wc_barrier)
+    {
+        MPI_Barrier(wc->cr->mpi_comm_mygroup);
+    }
+#else
+    GMX_UNUSED_VALUE(wc);
+#endif
 }
 
 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
-static void subtract_cycles(wallcc_t *wcc, int ewc_main, int ewc_sub)
+// NOLINTNEXTLINE(google-runtime-references)
+static void subtract_cycles(gmx::EnumerationArray<WallCycleCounter, wallcc_t>& wcc,
+                            WallCycleCounter                                   ewc_main,
+                            WallCycleCounter                                   ewc_sub)
 {
     if (wcc[ewc_sub].n > 0)
     {
@@ -437,50 +356,52 @@ static void subtract_cycles(wallcc_t *wcc, int ewc_main, int ewc_sub)
         else
         {
             /* Something is wrong with the cycle counting */
-            wcc[ewc_main].c  = 0;
+            wcc[ewc_main].c = 0;
         }
     }
 }
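
subtract_cycles above removes the cycles of a counter that ran nested inside another, so the parent reports exclusive rather than inclusive time, and zeroes the parent when the books do not balance. A minimal sketch of that logic, with hypothetical types and assuming the branch elided from this hunk subtracts whenever the parent has at least as many cycles as the nested counter:

    struct Cc
    {
        int  n = 0; // number of start/stop pairs
        long c = 0; // accumulated cycles
    };

    // Remove the nested counter's cycles from the parent; zero the parent if
    // the counts are inconsistent, as the else-branch above does.
    static void subtractNested(Cc* parent, const Cc& nested)
    {
        if (nested.n > 0)
        {
            if (parent->c >= nested.c)
            {
                parent->c -= nested.c;
            }
            else
            {
                parent->c = 0;
            }
        }
    }

    int main()
    {
        Cc force   = { 10, 5000 };
        Cc pmeMesh = { 10, 1200 };
        subtractNested(&force, pmeMesh);
        return force.c == 3800 ? 0 : 1; // force now holds only its exclusive cycles
    }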
 
-void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
+void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 {
     if (wc == nullptr)
     {
         return;
     }
 
-    for (int i = 0; i < ewcNR; i++)
+    for (auto key : keysOf(wc->wcc))
     {
-        if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
+        if (is_pme_counter(key) || (key == WallCycleCounter::Run && isPmeRank))
         {
-            wc->wcc[i].c *= nthreads_pme;
+            wc->wcc[key].c *= nthreads_pme;
 
-            if (wc->wcc_all)
+            if (!wc->wcc_all.empty())
             {
-                for (int j = 0; j < ewcNR; j++)
+                const int current = static_cast<int>(key);
+                for (int j = 0; j < sc_numWallCycleCounters; j++)
                 {
-                    wc->wcc_all[i*ewcNR+j].c *= nthreads_pme;
+                    wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pme;
                 }
             }
         }
         else
         {
-            wc->wcc[i].c *= nthreads_pp;
+            wc->wcc[key].c *= nthreads_pp;
 
-            if (wc->wcc_all)
+            if (!wc->wcc_all.empty())
             {
-                for (int j = 0; j < ewcNR; j++)
+                const int current = static_cast<int>(key);
+                for (int j = 0; j < sc_numWallCycleCounters; j++)
                 {
-                    wc->wcc_all[i*ewcNR+j].c *= nthreads_pp;
+                    wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pp;
                 }
             }
         }
     }
-    if (useCycleSubcounters && wc->wcsc && !isPmeRank)
+    if (sc_useCycleSubcounters && !isPmeRank)
     {
-        for (int i = 0; i < ewcsNR; i++)
+        for (auto& counter : wc->wcsc)
         {
-            wc->wcsc[i].c *= nthreads_pp;
+            counter.c *= nthreads_pp;
         }
     }
 }
@@ -497,135 +418,154 @@ void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthr
  * wcc_all are unused by the GPU reporting, but it is not satisfactory
  * for the future. Also, there's no need for MPI_Allreduce, since
  * only MASTERRANK uses any of the results. */
-WallcycleCounts wallcycle_sum(t_commrec *cr, gmx_wallcycle_t wc)
+WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc)
 {
-    WallcycleCounts cycles_sum;
-    wallcc_t       *wcc;
-    double          cycles[ewcNR+ewcsNR];
+    WallcycleCounts                                    cycles_sum;
+    gmx::EnumerationArray<WallCycleCounter, double>    cyclesMain;
+    gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSub;
 #if GMX_MPI
-    double          cycles_n[ewcNR+ewcsNR+1];
+    gmx::EnumerationArray<WallCycleCounter, double>    cyclesMainOnNode;
+    gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSubOnNode;
 #endif
-    int             i;
-    int             nsum;
 
     if (wc == nullptr)
     {
         /* Default construction of std::array of non-class T can leave
-           the values indeterminate, just like a C array, and icc
-           warns about it. */
+           the values indeterminate, just like a C array */
         cycles_sum.fill(0);
         return cycles_sum;
     }
 
-    wcc = wc->wcc;
+    auto& wcc = wc->wcc;
 
-    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
-    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
+    subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommLoad);
+    subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommBound);
 
-    subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
+    subtract_cycles(wcc, WallCycleCounter::PmeFft, WallCycleCounter::PmeFftComm);
 
     if (cr->npmenodes == 0)
     {
         /* All nodes do PME (or no PME at all) */
-        subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
+        subtract_cycles(wcc, WallCycleCounter::Force, WallCycleCounter::PmeMesh);
     }
     else
     {
         /* There are PME-only nodes */
-        if (wcc[ewcPMEMESH].n > 0)
+        if (wcc[WallCycleCounter::PmeMesh].n > 0)
         {
             /* This must be a PME only node, calculate the Wait + Comm. time */
-            GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c, "Total run ticks must be greater than PME-only ticks");
-            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
+            GMX_ASSERT(wcc[WallCycleCounter::Run].c >= wcc[WallCycleCounter::PmeMesh].c,
+                       "Total run ticks must be greater than PME-only ticks");
+            wcc[WallCycleCounter::PmeWaitComm].c =
+                    wcc[WallCycleCounter::Run].c - wcc[WallCycleCounter::PmeMesh].c;
         }
     }
 
     /* Store the cycles in a double buffer for summing */
-    for (i = 0; i < ewcNR; i++)
+    for (auto key : keysOf(wcc))
     {
 #if GMX_MPI
-        cycles_n[i] = static_cast<double>(wcc[i].n);
+        cyclesMainOnNode[key] = static_cast<double>(wcc[key].n);
 #endif
-        cycles[i]   = static_cast<double>(wcc[i].c);
+        cyclesMain[key] = static_cast<double>(wcc[key].c);
     }
-    nsum = ewcNR;
-    if (wc->wcsc)
+    if (sc_useCycleSubcounters)
     {
-        for (i = 0; i < ewcsNR; i++)
+        for (auto key : keysOf(wc->wcsc))
         {
 #if GMX_MPI
-            cycles_n[ewcNR+i] = static_cast<double>(wc->wcsc[i].n);
+            cyclesSubOnNode[key] = static_cast<double>(wc->wcsc[key].n);
 #endif
-            cycles[ewcNR+i]   = static_cast<double>(wc->wcsc[i].c);
+            cyclesSub[key] = static_cast<double>(wc->wcsc[key].c);
         }
-        nsum += ewcsNR;
     }
 
 #if GMX_MPI
     if (cr->nnodes > 1)
     {
-        double buf[ewcNR+ewcsNR+1];
+        gmx::EnumerationArray<WallCycleCounter, double>    bufMain;
+        gmx::EnumerationArray<WallCycleSubCounter, double> bufSub;
 
         // TODO this code is used only at the end of the run, so we
         // can just do a simple reduce of haveInvalidCount in
         // wallcycle_print, and avoid bugs
-        cycles_n[nsum] = (wc->haveInvalidCount > 0 ? 1 : 0);
+        double haveInvalidCount = (wc->haveInvalidCount ? 1 : 0);
         // TODO Use MPI_Reduce
-        MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX,
-                      cr->mpi_comm_mysim);
-        for (i = 0; i < ewcNR; i++)
+        MPI_Allreduce(cyclesMainOnNode.data(), bufMain.data(), bufMain.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+        if (sc_useCycleSubcounters)
         {
-            wcc[i].n = static_cast<int>(buf[i] + 0.5);
+            MPI_Allreduce(cyclesSubOnNode.data(), bufSub.data(), bufSub.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
         }
-        wc->haveInvalidCount = (buf[nsum] > 0);
-        if (wc->wcsc)
+        MPI_Allreduce(MPI_IN_PLACE, &haveInvalidCount, 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+        for (auto key : keysOf(wcc))
         {
-            for (i = 0; i < ewcsNR; i++)
+            wcc[key].n = gmx::roundToInt(bufMain[key]);
+        }
+        wc->haveInvalidCount = (haveInvalidCount > 0);
+        if (sc_useCycleSubcounters)
+        {
+            for (auto key : keysOf(wc->wcsc))
             {
-                wc->wcsc[i].n = static_cast<int>(buf[ewcNR+i] + 0.5);
+                wc->wcsc[key].n = gmx::roundToInt(bufSub[key]);
             }
         }
 
         // TODO Use MPI_Reduce
-        MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM,
-                      cr->mpi_comm_mysim);
+        MPI_Allreduce(cyclesMain.data(), cycles_sum.data(), cyclesMain.size(), MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+        if (sc_useCycleSubcounters)
+        {
+            MPI_Allreduce(cyclesSub.data(),
+                          cycles_sum.data() + sc_numWallCycleCounters,
+                          cyclesSub.size(),
+                          MPI_DOUBLE,
+                          MPI_SUM,
+                          cr->mpi_comm_mysim);
+        }
 
-        if (wc->wcc_all != nullptr)
+        if (!wc->wcc_all.empty())
         {
-            double *buf_all, *cyc_all;
+            std::array<double, sc_numWallCycleCountersSquared> cyc_all;
+            std::array<double, sc_numWallCycleCountersSquared> buf_all;
 
-            snew(cyc_all, ewcNR*ewcNR);
-            snew(buf_all, ewcNR*ewcNR);
-            for (i = 0; i < ewcNR*ewcNR; i++)
+            for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
             {
                 cyc_all[i] = wc->wcc_all[i].c;
             }
             // TODO Use MPI_Reduce
-            MPI_Allreduce(cyc_all, buf_all, ewcNR*ewcNR, MPI_DOUBLE, MPI_SUM,
+            MPI_Allreduce(cyc_all.data(),
+                          buf_all.data(),
+                          sc_numWallCycleCountersSquared,
+                          MPI_DOUBLE,
+                          MPI_SUM,
                           cr->mpi_comm_mysim);
-            for (i = 0; i < ewcNR*ewcNR; i++)
+            for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
             {
                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
             }
-            sfree(buf_all);
-            sfree(cyc_all);
         }
     }
     else
 #endif
     {
-        for (i = 0; i < nsum; i++)
+        for (auto key : keysOf(cyclesMain))
         {
-            cycles_sum[i] = cycles[i];
+            cycles_sum[static_cast<int>(key)] = cyclesMain[key];
+        }
+        if (sc_useCycleSubcounters)
+        {
+            for (auto key : keysOf(cyclesSub))
+            {
+                const int offset   = static_cast<int>(key) + sc_numWallCycleCounters;
+                cycles_sum[offset] = cyclesSub[key];
+            }
         }
     }
 
     return cycles_sum;
 }
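
The wcc_all data reduced above is a square matrix stored flat: the entry for the counter pair (i, j) lives at index i * sc_numWallCycleCounters + j. A minimal standalone sketch of that flattened 2D indexing, with hypothetical names and a made-up size:

    #include <cstdio>
    #include <vector>

    int main()
    {
        constexpr int N = 4; // stand-in for sc_numWallCycleCounters
        std::vector<long> pairCycles(N * N, 0);

        // Accumulate cycles attributed to the pair (mainCounter, subCounter).
        auto add = [&pairCycles, N](int mainCounter, int subCounter, long cycles) {
            pairCycles[mainCounter * N + subCounter] += cycles;
        };

        add(1, 2, 1000);
        add(1, 2, 500);

        std::printf("pair (1,2): %ld cycles\n", pairCycles[1 * N + 2]); // prints 1500
        return 0;
    }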
 
-static void print_cycles(FILE *fplog, double c2t, const char *name,
-                         int nnodes, int nthreads,
-                         int ncalls, double c_sum, double tot)
+static void
+print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads, int ncalls, double c_sum, double tot)
 {
     char   nnodes_str[STRLEN];
     char   nthreads_str[STRLEN];
@@ -662,16 +602,21 @@ static void print_cycles(FILE *fplog, double c2t, const char *name,
             ncalls_str[0]   = 0;
         }
         /* Convert the cycle count to wallclock time for this task */
-        wallt = c_sum*c2t;
+        wallt = c_sum * c2t;
 
-        fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
-                name, nnodes_str, nthreads_str, ncalls_str, wallt,
-                c_sum*1e-9, percentage);
+        fprintf(fplog,
+                " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
+                name,
+                nnodes_str,
+                nthreads_str,
+                ncalls_str,
+                wallt,
+                c_sum * 1e-9,
+                percentage);
     }
 }
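
For reference, the conversion applied just above is wallt = c_sum * c2t, where wallcycle_print later computes c2t = realtime / tot from the cycles of the Run counter, and the final percentage column relates the row's cycles to the total. A short sketch with made-up numbers (illustrative only, not measured data):

    #include <cstdio>

    int main()
    {
        const double realtime = 100.0;  // wall time of the whole run, in seconds (made up)
        const double tot      = 2.0e12; // summed cycles of the Run counter (made up)
        const double c_sum    = 4.0e11; // cycles accumulated by one table row (made up)

        const double c2t   = realtime / tot; // seconds per cycle
        const double wallt = c_sum * c2t;    // 20 s in the "Wall t (s)" column

        std::printf("c2t = %g s/cycle, wall t = %.1f s, share = %.1f %%\n",
                    c2t, wallt, 100.0 * c_sum / tot);
        return 0;
    }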
 
-static void print_gputimes(FILE *fplog, const char *name,
-                           int n, double t, double tot_t)
+static void print_gputimes(FILE* fplog, const char* name, int n, double t, double tot_t)
 {
     char num[11];
     char avg_perf[11];
@@ -679,7 +624,7 @@ static void print_gputimes(FILE *fplog, const char *name,
     if (n > 0)
     {
         snprintf(num, sizeof(num), "%10d", n);
-        snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t/n);
+        snprintf(avg_perf, sizeof(avg_perf), "%10.3f", t / n);
     }
     else
     {
@@ -688,17 +633,15 @@ static void print_gputimes(FILE *fplog, const char *name,
     }
     if (t != tot_t && tot_t > 0)
     {
-        fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
-                name, num, t/1000, avg_perf, 100 * t/tot_t);
+        fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, num, t / 1000, avg_perf, 100 * t / tot_t);
     }
     else
     {
-        fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n",
-                name, "", t/1000, avg_perf, 100.0);
+        fprintf(fplog, " %-29s %10s%12.3f   %s   %5.1f\n", name, "", t / 1000, avg_perf, 100.0);
     }
 }
 
-static void print_header(FILE *fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
+static void print_header(FILE* fplog, int nrank_pp, int nth_pp, int nrank_pme, int nth_pme)
 {
     int nrank_tot = nrank_pp + nrank_pme;
     if (0 == nrank_pme)
@@ -731,17 +674,24 @@ static void print_header(FILE *fplog, int nrank_pp, int nth_pp, int nrank_pme, i
 }
 
 
-void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int npme,
-                     int nth_pp, int nth_pme, double realtime,
-                     gmx_wallcycle_t wc, const WallcycleCounts &cyc_sum,
-                     const gmx_wallclock_gpu_nbnxn_t *gpu_nbnxn_t,
-                     const gmx_wallclock_gpu_pme_t *gpu_pme_t)
+void wallcycle_print(FILE*                            fplog,
+                     const gmx::MDLogger&             mdlog,
+                     int                              nnodes,
+                     int                              npme,
+                     int                              nth_pp,
+                     int                              nth_pme,
+                     double                           realtime,
+                     gmx_wallcycle*                   wc,
+                     const WallcycleCounts&           cyc_sum,
+                     const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
+                     const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 {
     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
     double      c2t, c2t_pp, c2t_pme = 0;
-    int         i, j, npp, nth_tot;
+    int         npp, nth_tot;
     char        buf[STRLEN];
-    const char *hline = "-----------------------------------------------------------------------------";
+    const char* hline =
+            "-----------------------------------------------------------------------------";
 
     if (wc == nullptr)
     {
@@ -752,16 +702,16 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
     GMX_ASSERT(nth_pme > 0, "Number of PME threads must be >0");
     GMX_ASSERT(nnodes > 0, "Number of nodes must be >0");
     GMX_ASSERT(npme >= 0, "Number of PME nodes cannot be negative");
-    npp     = nnodes - npme;
+    npp = nnodes - npme;
     /* npme is the number of PME-only ranks used, and we always do PP work */
     GMX_ASSERT(npp > 0, "Number of particle-particle nodes must be >0");
 
-    nth_tot = npp*nth_pp + npme*nth_pme;
+    nth_tot = npp * nth_pp + npme * nth_pme;
 
     /* When using PME-only nodes, the next line is valid for both
        PP-only and PME-only nodes because they started ewcRUN at the
        same time. */
-    tot        = cyc_sum[ewcRUN];
+    tot        = cyc_sum[static_cast<int>(WallCycleCounter::Run)];
     tot_for_pp = 0;
 
     if (tot <= 0.0)
@@ -770,28 +720,36 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
            code so that it is provably robust with respect to
            non-positive values for all possible timer and cycle
            counters, there is less value gained from printing whatever
-           timing data might still be sensible for some non-Jenkins
-           run, than is lost from diagnosing Jenkins FP exceptions on
+           timing data might still be sensible for some non-CI
+           run, than is lost from diagnosing CI FP exceptions on
            runs about whose execution time we don't care. */
-        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a time accounting",
-                tot);
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendTextFormatted(
+                        "WARNING: A total of %f CPU cycles was recorded, so mdrun cannot print a "
+                        "time accounting",
+                        tot);
         return;
     }
 
     if (wc->haveInvalidCount)
     {
-        GMX_LOG(mdlog.warning).asParagraph().appendText("NOTE: Detected invalid cycle counts, probably because threads moved between CPU cores that do not have synchronized cycle counters. Will not print the cycle accounting.");
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendText(
+                        "NOTE: Detected invalid cycle counts, probably because threads moved "
+                        "between CPU cores that do not have synchronized cycle counters. Will not "
+                        "print the cycle accounting.");
         return;
     }
 
 
     /* Conversion factor from cycles to seconds */
-    c2t     = realtime/tot;
-    c2t_pp  = c2t * nth_tot / static_cast<double>(npp*nth_pp);
+    c2t    = realtime / tot;
+    c2t_pp = c2t * nth_tot / static_cast<double>(npp * nth_pp);
     if (npme > 0)
     {
-        c2t_pme = c2t * nth_tot / static_cast<double>(npme*nth_pme);
+        c2t_pme = c2t * nth_tot / static_cast<double>(npme * nth_pme);
     }
     else
     {
@@ -803,57 +761,71 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
     print_header(fplog, npp, nth_pp, npme, nth_pme);
 
     fprintf(fplog, "%s\n", hline);
-    for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
+    gmx::EnumerationWrapper<WallCycleCounter> iter;
+    for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
+         key != iter.end();
+         ++key)
     {
-        if (is_pme_subcounter(i))
+
+        if (is_pme_subcounter(*key))
         {
             /* Do not count these at all */
         }
-        else if (npme > 0 && is_pme_counter(i))
+        else if (npme > 0 && is_pme_counter(*key))
         {
             /* Print timing information for PME-only nodes, but add an
              * asterisk so the reader of the table can know that the
              * walltimes are not meant to add up. The asterisk still
              * fits in the required maximum of 19 characters. */
-            char buffer[STRLEN];
-            snprintf(buffer, STRLEN, "%s *", wcn[i]);
-            print_cycles(fplog, c2t_pme, buffer,
-                         npme, nth_pme,
-                         wc->wcc[i].n, cyc_sum[i], tot);
+            std::string message = gmx::formatString("%s *", enumValuetoString(*key));
+            print_cycles(fplog,
+                         c2t_pme,
+                         message.c_str(),
+                         npme,
+                         nth_pme,
+                         wc->wcc[*key].n,
+                         cyc_sum[static_cast<int>(*key)],
+                         tot);
         }
         else
         {
             /* Print timing information when it is for a PP or PP+PME
                node */
-            print_cycles(fplog, c2t_pp, wcn[i],
-                         npp, nth_pp,
-                         wc->wcc[i].n, cyc_sum[i], tot);
-            tot_for_pp += cyc_sum[i];
+            print_cycles(fplog,
+                         c2t_pp,
+                         enumValuetoString(*key),
+                         npp,
+                         nth_pp,
+                         wc->wcc[*key].n,
+                         cyc_sum[static_cast<int>(*key)],
+                         tot);
+            tot_for_pp += cyc_sum[static_cast<int>(*key)];
         }
     }
-    if (wc->wcc_all != nullptr)
+    if (!wc->wcc_all.empty())
     {
-        for (i = 0; i < ewcNR; i++)
+        for (auto i : keysOf(wc->wcc))
         {
-            for (j = 0; j < ewcNR; j++)
+            const int countI = static_cast<int>(i);
+            for (auto j : keysOf(wc->wcc))
             {
-                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
-                print_cycles(fplog, c2t_pp, buf,
-                             npp, nth_pp,
-                             wc->wcc_all[i*ewcNR+j].n,
-                             wc->wcc_all[i*ewcNR+j].c,
+                const int countJ = static_cast<int>(j);
+                snprintf(buf, 20, "%-9.9s %-9.9s", enumValuetoString(i), enumValuetoString(j));
+                print_cycles(fplog,
+                             c2t_pp,
+                             buf,
+                             npp,
+                             nth_pp,
+                             wc->wcc_all[countI * sc_numWallCycleCounters + countJ].n,
+                             wc->wcc_all[countI * sc_numWallCycleCounters + countJ].c,
                              tot);
             }
         }
     }
     tot_for_rest = tot * npp * nth_pp / static_cast<double>(nth_tot);
-    print_cycles(fplog, c2t_pp, "Rest",
-                 npp, nth_pp,
-                 -1, tot_for_rest - tot_for_pp, tot);
+    print_cycles(fplog, c2t_pp, "Rest", npp, nth_pp, -1, tot_for_rest - tot_for_pp, tot);
     fprintf(fplog, "%s\n", hline);
-    print_cycles(fplog, c2t, "Total",
-                 npp, nth_pp,
-                 -1, tot, tot);
+    print_cycles(fplog, c2t, "Total", npp, nth_pp, -1, tot, tot);
     fprintf(fplog, "%s\n", hline);
 
     if (npme > 0)
@@ -861,19 +833,22 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
         fprintf(fplog,
                 "(*) Note that with separate PME ranks, the walltime column actually sums to\n"
                 "    twice the total reported, but the cycle count total and %% are correct.\n"
-                "%s\n", hline);
+                "%s\n",
+                hline);
     }
 
-    if (wc->wcc[ewcPMEMESH].n > 0)
+    if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
     {
         // A workaround to not print breakdown when no subcounters were recorded.
         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
-        std::vector<int> validPmeSubcounterIndices;
-        for (i = ewcPPDURINGPME+1; i < ewcNR; i++)
+        std::vector<WallCycleCounter> validPmeSubcounterIndices;
+        for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
+             key != iter.end();
+             key++)
         {
-            if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
+            if (is_pme_subcounter(*key) && wc->wcc[*key].n > 0)
             {
-                validPmeSubcounterIndices.push_back(i);
+                validPmeSubcounterIndices.push_back(*key);
             }
         }
 
@@ -883,23 +858,33 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
             fprintf(fplog, "%s\n", hline);
             for (auto i : validPmeSubcounterIndices)
             {
-                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i],
-                             npme > 0 ? npme : npp, nth_pme,
-                             wc->wcc[i].n, cyc_sum[i], tot);
+                print_cycles(fplog,
+                             npme > 0 ? c2t_pme : c2t_pp,
+                             enumValuetoString(i),
+                             npme > 0 ? npme : npp,
+                             nth_pme,
+                             wc->wcc[i].n,
+                             cyc_sum[static_cast<int>(i)],
+                             tot);
             }
             fprintf(fplog, "%s\n", hline);
         }
     }
 
-    if (useCycleSubcounters && wc->wcsc)
+    if (sc_useCycleSubcounters)
     {
         fprintf(fplog, " Breakdown of PP computation\n");
         fprintf(fplog, "%s\n", hline);
-        for (i = 0; i < ewcsNR; i++)
+        for (auto key : keysOf(wc->wcsc))
         {
-            print_cycles(fplog, c2t_pp, wcsn[i],
-                         npp, nth_pp,
-                         wc->wcsc[i].n, cyc_sum[ewcNR+i], tot);
+            print_cycles(fplog,
+                         c2t_pp,
+                         enumValuetoString(key),
+                         npp,
+                         nth_pp,
+                         wc->wcsc[key].n,
+                         cyc_sum[sc_numWallCycleCounters + static_cast<int>(key)],
+                         tot);
         }
         fprintf(fplog, "%s\n", hline);
     }
@@ -908,64 +893,66 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
     double tot_gpu = 0.0;
     if (gpu_pme_t)
     {
-        for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
+        for (auto key : keysOf(gpu_pme_t->timing))
         {
-            tot_gpu += gpu_pme_t->timing[k].t;
+            tot_gpu += gpu_pme_t->timing[key].t;
         }
     }
     if (gpu_nbnxn_t)
     {
-        const char *k_log_str[2][2] = {
-            {"Nonbonded F kernel", "Nonbonded F+ene k."},
-            {"Nonbonded F+prune k.", "Nonbonded F+ene+prune k."}
-        };
+        const char* k_log_str[2][2] = { { "Nonbonded F kernel", "Nonbonded F+ene k." },
+                                        { "Nonbonded F+prune k.", "Nonbonded F+ene+prune k." } };
         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 
         /* add up the kernel timings */
-        for (i = 0; i < 2; i++)
+        for (int i = 0; i < 2; i++)
         {
-            for (j = 0; j < 2; j++)
+            for (int j = 0; j < 2; j++)
             {
                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
             }
         }
         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 
-        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
-        if (wc->wcc[ewcPMEMESH].n > 0)
+        tot_cpu_overlap = wc->wcc[WallCycleCounter::Force].c;
+        if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
         {
-            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
+            tot_cpu_overlap += wc->wcc[WallCycleCounter::PmeMesh].c;
         }
-        tot_cpu_overlap *= realtime*1000/tot; /* convert s to ms */
+        tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 
         fprintf(fplog, "\n GPU timings\n%s\n", hline);
-        fprintf(fplog, " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
+        fprintf(fplog,
+                " Computing:                         Count  Wall t (s)      ms/step       %c\n",
+                '%');
         fprintf(fplog, "%s\n", hline);
-        print_gputimes(fplog, "Pair list H2D",
-                       gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
-        print_gputimes(fplog, "X / q H2D",
-                       gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
+        print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
+        print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 
-        for (i = 0; i < 2; i++)
+        for (int i = 0; i < 2; i++)
         {
-            for (j = 0; j < 2; j++)
+            for (int j = 0; j < 2; j++)
             {
                 if (gpu_nbnxn_t->ktime[i][j].c)
                 {
-                    print_gputimes(fplog, k_log_str[i][j],
-                                   gpu_nbnxn_t->ktime[i][j].c, gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
+                    print_gputimes(fplog,
+                                   k_log_str[i][j],
+                                   gpu_nbnxn_t->ktime[i][j].c,
+                                   gpu_nbnxn_t->ktime[i][j].t,
+                                   tot_gpu);
                 }
             }
         }
         if (gpu_pme_t)
         {
-            for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
+            for (auto key : keysOf(gpu_pme_t->timing))
             {
-                if (gpu_pme_t->timing[k].c)
+                if (gpu_pme_t->timing[key].c)
                 {
-                    print_gputimes(fplog, PMEStageNames[k],
-                                   gpu_pme_t->timing[k].c,
-                                   gpu_pme_t->timing[k].t,
+                    print_gputimes(fplog,
+                                   enumValuetoString(key),
+                                   gpu_pme_t->timing[key].c,
+                                   gpu_pme_t->timing[key].t,
                                    tot_gpu);
                 }
             }
@@ -974,7 +961,7 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
         {
             print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
         }
-        print_gputimes(fplog, "F D2H",  gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
+        print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
         fprintf(fplog, "%s\n", hline);
         print_gputimes(fplog, "Total ", gpu_nbnxn_t->nb_c, tot_gpu, tot_gpu);
         fprintf(fplog, "%s\n", hline);
@@ -984,19 +971,26 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
              * and avoid adding it to tot_gpu as this is not in the force
              * overlap. We print the fraction as relative to the rest.
              */
-            print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c, gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
+            print_gputimes(fplog,
+                           "*Dynamic pruning",
+                           gpu_nbnxn_t->dynamicPruneTime.c,
+                           gpu_nbnxn_t->dynamicPruneTime.t,
+                           tot_gpu);
             fprintf(fplog, "%s\n", hline);
         }
-        gpu_cpu_ratio = tot_gpu/tot_cpu_overlap;
-        if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
+        gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
+        if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[WallCycleCounter::Force].n > 0)
         {
-            fprintf(fplog, "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = %.3f\n",
-                    tot_gpu/gpu_nbnxn_t->nb_c, tot_cpu_overlap/wc->wcc[ewcFORCE].n,
+            fprintf(fplog,
+                    "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
+                    "%.3f\n",
+                    tot_gpu / gpu_nbnxn_t->nb_c,
+                    tot_cpu_overlap / wc->wcc[WallCycleCounter::Force].n,
                     gpu_cpu_ratio);
         }
 
         /* only print notes related to CPU-GPU load balance with PME */
-        if (wc->wcc[ewcPMEMESH].n > 0)
+        if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
         {
             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
 
@@ -1012,26 +1006,37 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
                         /* The user could have used -notunepme,
                          * but we currently can't check that here.
                          */
-                        GMX_LOG(mdlog.warning).asParagraph().appendText(
-                                "NOTE: The CPU has >25% more load than the GPU. This imbalance wastes\n"
-                                "      GPU resources. Maybe the domain decomposition limits the PME tuning.\n"
-                                "      In that case, try setting the DD grid manually (-dd) or lowering -dds.");
+                        GMX_LOG(mdlog.warning)
+                                .asParagraph()
+                                .appendText(
+                                        "NOTE: The CPU has >25% more load than the GPU. This "
+                                        "imbalance wastes\n"
+                                        "      GPU resources. Maybe the domain decomposition "
+                                        "limits the PME tuning.\n"
+                                        "      In that case, try setting the DD grid manually "
+                                        "(-dd) or lowering -dds.");
                     }
                     else
                     {
                         /* We should not end up here, unless the box is
                          * too small for increasing the cut-off for PME tuning.
                          */
-                        GMX_LOG(mdlog.warning).asParagraph().appendText(
-                                "NOTE: The CPU has >25% more load than the GPU. This imbalance wastes\n"
-                                "      GPU resources.");
+                        GMX_LOG(mdlog.warning)
+                                .asParagraph()
+                                .appendText(
+                                        "NOTE: The CPU has >25% more load than the GPU. This "
+                                        "imbalance wastes\n"
+                                        "      GPU resources.");
                     }
                 }
                 if (gpu_cpu_ratio > 1.25)
                 {
-                    GMX_LOG(mdlog.warning).asParagraph().appendText(
-                            "NOTE: The GPU has >25% more load than the CPU. This imbalance wastes\n"
-                            "      CPU resources.");
+                    GMX_LOG(mdlog.warning)
+                            .asParagraph()
+                            .appendText(
+                                    "NOTE: The GPU has >25% more load than the CPU. This imbalance "
+                                    "wastes\n"
+                                    "      CPU resources.");
                 }
             }
         }
@@ -1039,85 +1044,67 @@ void wallcycle_print(FILE *fplog, const gmx::MDLogger &mdlog, int nnodes, int np
 
     if (wc->wc_barrier)
     {
-        GMX_LOG(mdlog.warning).asParagraph().appendText(
-                "MPI_Barrier was called before each cycle start/stop\n"
-                "call, so timings are not those of real runs.");
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendText(
+                        "MPI_Barrier was called before each cycle start/stop\n"
+                        "call, so timings are not those of real runs.");
     }
 
-    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 &&
-        (cyc_sum[ewcDOMDEC] > tot*0.1 ||
-         cyc_sum[ewcNS] > tot*0.1))
+    if (wc->wcc[WallCycleCounter::NbXFBufOps].n > 0
+        && (cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] > tot * 0.1
+            || cyc_sum[static_cast<int>(WallCycleCounter::NS)] > tot * 0.1))
     {
         /* Only the sim master calls this function, so always print to stderr */
-        if (wc->wcc[ewcDOMDEC].n == 0)
+        if (wc->wcc[WallCycleCounter::Domdec].n == 0)
         {
-            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                    "NOTE: %d %% of the run time was spent in pair search,\n"
-                    "      you might want to increase nstlist (this has no effect on accuracy)\n",
-                    (int)(100*cyc_sum[ewcNS]/tot+0.5));
+            GMX_LOG(mdlog.warning)
+                    .asParagraph()
+                    .appendTextFormatted(
+                            "NOTE: %d %% of the run time was spent in pair search,\n"
+                            "      you might want to increase nstlist (this has no effect on "
+                            "accuracy)\n",
+                            gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
         }
         else
         {
-            GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                    "NOTE: %d %% of the run time was spent in domain decomposition,\n"
-                    "      %d %% of the run time was spent in pair search,\n"
-                    "      you might want to increase nstlist (this has no effect on accuracy)\n",
-                    (int)(100*cyc_sum[ewcDOMDEC]/tot+0.5),
-                    (int)(100*cyc_sum[ewcNS]/tot+0.5));
+            GMX_LOG(mdlog.warning)
+                    .asParagraph()
+                    .appendTextFormatted(
+                            "NOTE: %d %% of the run time was spent in domain decomposition,\n"
+                            "      %d %% of the run time was spent in pair search,\n"
+                            "      you might want to increase nstlist (this has no effect on "
+                            "accuracy)\n",
+                            gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] / tot),
+                            gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
         }
     }
 
-    if (cyc_sum[ewcMoveE] > tot*0.05)
+    if (cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] > tot * 0.05)
     {
-        GMX_LOG(mdlog.warning).asParagraph().appendTextFormatted(
-                "NOTE: %d %% of the run time was spent communicating energies,\n"
-                "      you might want to use the -gcom option of mdrun\n",
-                (int)(100*cyc_sum[ewcMoveE]/tot+0.5));
+        GMX_LOG(mdlog.warning)
+                .asParagraph()
+                .appendTextFormatted(
+                        "NOTE: %d %% of the run time was spent communicating energies,\n"
+                        "      you might want to increase some nst* mdp options\n",
+                        gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] / tot));
     }
 }
 
-extern gmx_int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
+int64_t wcycle_get_reset_counters(gmx_wallcycle* wc)
 {
     if (wc == nullptr)
     {
         return -1;
     }
-
     return wc->reset_counters;
 }
 
-extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, gmx_int64_t reset_counters)
+void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters)
 {
     if (wc == nullptr)
     {
         return;
     }
-
     wc->reset_counters = reset_counters;
 }
-
-void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
-{
-    if (useCycleSubcounters && wc != nullptr)
-    {
-        wc->wcsc[ewcs].start = gmx_cycles_read();
-    }
-}
-
-void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
-{
-    if (useCycleSubcounters && wc != nullptr)
-    {
-        wallcycle_sub_start(wc, ewcs);
-        wc->wcsc[ewcs].n--;
-    }
-}
-
-void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
-{
-    if (useCycleSubcounters && wc != nullptr)
-    {
-        wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
-        wc->wcsc[ewcs].n++;
-    }
-}
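
The removed wallcycle_sub_start/stop bodies above show the basic pattern every counter in this file follows: read the cycle counter at start, accumulate the difference and bump the call count at stop, and treat a negative difference as an invalid count (threads migrating between cores whose cycle counters are not synchronized). A minimal standalone sketch of that pattern, with hypothetical names and std::chrono ticks standing in for gmx_cycles_read():

    #include <chrono>
    #include <cstdint>
    #include <cstdio>

    // Stand-in for gmx_cycles_read(): here just steady-clock ticks.
    static std::uint64_t readTicks()
    {
        return static_cast<std::uint64_t>(std::chrono::steady_clock::now().time_since_epoch().count());
    }

    struct Counter
    {
        int           n     = 0; // number of start/stop pairs
        std::uint64_t c     = 0; // accumulated ticks
        std::uint64_t start = 0;
    };

    static void counterStart(Counter* cc)
    {
        cc->start = readTicks();
    }

    static void counterStop(Counter* cc)
    {
        const std::uint64_t now = readTicks();
        if (now >= cc->start) // as in wallcycle_stop: a negative difference would be an invalid count
        {
            cc->c += now - cc->start;
        }
        cc->n++;
    }

    int main()
    {
        Counter force;
        counterStart(&force);
        volatile double work = 0.0; // some timed work
        for (int i = 0; i < 1000000; i++)
        {
            work = work + i * 1e-9;
        }
        counterStop(&force);
        std::printf("calls: %d, ticks: %llu\n", force.n, static_cast<unsigned long long>(force.c));
        return 0;
    }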