Remove/replace many mentions of Jenkins
[alexxy/gromacs.git] / src / gromacs / timing / wallcycle.cpp
index 45714a1da2cca611d7ea0709687ed15dae5aff0b..1f1d7ca38ea2eec6b6ac4f68f2505d530ed0e163 100644 (file)
@@ -4,7 +4,7 @@
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2008, The GROMACS development team.
  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -44,6 +44,7 @@
 #include <cstdlib>
 
 #include <array>
+#include <memory>
 #include <vector>
 
 #include "gromacs/math/functions.h"
 #include "gromacs/timing/cyclecounter.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/timing/wallcyclereporting.h"
+#include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/enumerationhelpers.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/gmxmpi.h"
 #include "gromacs/utility/logger.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/snprintf.h"
+#include "gromacs/utility/stringutil.h"
 
-static const bool useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
+//! Whether wallcycle debugging is enabled
+constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
+//! True if only the master rank should print debugging output
+constexpr bool gmx_unused onlyMasterDebugPrints = true;
+//! True if cycle counter nesting depth debugging prints are enabled
+constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
 
-/* DEBUG_WCYCLE adds consistency checking for the counters.
- * It checks if you stop a counter different from the last
- * one that was opened and if you do nest too deep.
- */
-/* #define DEBUG_WCYCLE */
-
-#ifdef DEBUG_WCYCLE
+#if DEBUG_WCYCLE
 #    include "gromacs/utility/fatalerror.h"
 #endif
 
-typedef struct
+/* Each name should not exceed 19 printing characters
+   (i.e. the terminating null can be the twentieth) */
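+/* The entries in each name table below must be in the same order as the corresponding enum declaration */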
+static const char* enumValuetoString(WallCycleCounter enumValue)
 {
-    int          n;
-    gmx_cycles_t c;
-    gmx_cycles_t start;
-} wallcc_t;
+    constexpr gmx::EnumerationArray<WallCycleCounter, const char*> wallCycleCounterNames = {
+        "Run",
+        "Step",
+        "PP during PME",
+        "Domain decomp.",
+        "DD comm. load",
+        "DD comm. bounds",
+        "Vsite constr.",
+        "Send X to PME",
+        "Neighbor search",
+        "Launch GPU ops.",
+        "Comm. coord.",
+        "Force",
+        "Wait + Comm. F",
+        "PME mesh",
+        "PME redist. X/F",
+        "PME spread",
+        "PME gather",
+        "PME 3D-FFT",
+        "PME 3D-FFT Comm.",
+        "PME solve LJ",
+        "PME solve Elec",
+        "PME wait for PP",
+        "Wait + Recv. PME F",
+        "Wait PME GPU spread",
+        "PME 3D-FFT",
+        "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
+        "Wait PME GPU gather",
+        "Wait Bonded GPU",
+        "Reduce GPU PME F",
+        "Wait GPU NB nonloc.",
+        "Wait GPU NB local",
+        "Wait GPU state copy",
+        "NB X/F buffer ops.",
+        "Vsite spread",
+        "COM pull force",
+        "AWH",
+        "Write traj.",
+        "Update",
+        "Constraints",
+        "Comm. energies",
+        "Enforced rotation",
+        "Add rot. forces",
+        "Position swapping",
+        "IMD",
+        "Test"
+    };
+    return wallCycleCounterNames[enumValue];
+}
 
-struct gmx_wallcycle
+static const char* enumValuetoString(WallCycleSubCounter enumValue)
 {
-    wallcc_t* wcc;
-    /* did we detect one or more invalid cycle counts */
-    gmx_bool haveInvalidCount;
-    /* variables for testing/debugging */
-    gmx_bool  wc_barrier;
-    wallcc_t* wcc_all;
-    int       wc_depth;
-#ifdef DEBUG_WCYCLE
-#    define DEPTH_MAX 6
-    int counterlist[DEPTH_MAX];
-    int count_depth;
-#endif
-    int          ewc_prev;
-    gmx_cycles_t cycle_prev;
-    int64_t      reset_counters;
-#if GMX_MPI
-    MPI_Comm mpi_comm_mygroup;
-#endif
-    wallcc_t* wcsc;
-};
-
-/* Each name should not exceed 19 printing characters
-   (ie. terminating null can be twentieth) */
-static const char* wcn[ewcNR] = { "Run",
-                                  "Step",
-                                  "PP during PME",
-                                  "Domain decomp.",
-                                  "DD comm. load",
-                                  "DD comm. bounds",
-                                  "Vsite constr.",
-                                  "Send X to PME",
-                                  "Neighbor search",
-                                  "Launch GPU ops.",
-                                  "Comm. coord.",
-                                  "Force",
-                                  "Wait + Comm. F",
-                                  "PME mesh",
-                                  "PME redist. X/F",
-                                  "PME spread",
-                                  "PME gather",
-                                  "PME 3D-FFT",
-                                  "PME 3D-FFT Comm.",
-                                  "PME solve LJ",
-                                  "PME solve Elec",
-                                  "PME wait for PP",
-                                  "Wait + Recv. PME F",
-                                  "Wait PME GPU spread",
-                                  "PME 3D-FFT",
-                                  "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
-                                  "Wait PME GPU gather",
-                                  "Wait Bonded GPU",
-                                  "Reduce GPU PME F",
-                                  "Wait GPU NB nonloc.",
-                                  "Wait GPU NB local",
-                                  "Wait GPU state copy",
-                                  "NB X/F buffer ops.",
-                                  "Vsite spread",
-                                  "COM pull force",
-                                  "AWH",
-                                  "Write traj.",
-                                  "Update",
-                                  "Constraints",
-                                  "Comm. energies",
-                                  "Enforced rotation",
-                                  "Add rot. forces",
-                                  "Position swapping",
-                                  "IMD",
-                                  "Test" };
-
-static const char* wcsn[ewcsNR] = {
-    "DD redist.",
-    "DD NS grid + sort",
-    "DD setup comm.",
-    "DD make top.",
-    "DD make constr.",
-    "DD top. other",
-    "NS grid local",
-    "NS grid non-loc.",
-    "NS search local",
-    "NS search non-loc.",
-    "Bonded F",
-    "Bonded-FEP F",
-    "Restraints F",
-    "Listed buffer ops.",
-    "Nonbonded pruning",
-    "Nonbonded F kernel",
-    "Nonbonded F clear",
-    "Nonbonded FEP",
-    "Launch NB GPU tasks",
-    "Launch Bonded GPU tasks",
-    "Launch PME GPU tasks",
-    "Launch state copy",
-    "Ewald F correction",
-    "NB X buffer ops.",
-    "NB F buffer ops.",
-    "Clear force buffer",
-    "Launch GPU NB X buffer ops.",
-    "Launch GPU NB F buffer ops.",
-    "Launch GPU Comm. coord.",
-    "Launch GPU Comm. force.",
-    "Test subcounter",
-};
+    constexpr gmx::EnumerationArray<WallCycleSubCounter, const char*> wallCycleSubCounterNames = {
+        "DD redist.",
+        "DD NS grid + sort",
+        "DD setup comm.",
+        "DD make top.",
+        "DD make constr.",
+        "DD top. other",
+        "DD GPU ops.",
+        "NS grid local",
+        "NS grid non-loc.",
+        "NS search local",
+        "NS search non-loc.",
+        "Bonded F",
+        "Bonded-FEP F",
+        "Restraints F",
+        "Listed buffer ops.",
+        "Nonbonded pruning",
+        "Nonbonded F kernel",
+        "Nonbonded F clear",
+        "Nonbonded FEP",
+        "Nonbonded FEP reduction",
+        "Launch NB GPU tasks",
+        "Launch Bonded GPU tasks",
+        "Launch PME GPU tasks",
+        "Launch state copy",
+        "Ewald F correction",
+        "NB X buffer ops.",
+        "NB F buffer ops.",
+        "Clear force buffer",
+        "Launch GPU NB X buffer ops.",
+        "Launch GPU NB F buffer ops.",
+        "Launch GPU Comm. coord.",
+        "Launch GPU Comm. force.",
+        "Launch GPU update",
+        "Test subcounter"
+    };
+    return wallCycleSubCounterNames[enumValue];
+}
 
 /* PME GPU timing events' names - correspond to the enum in gpu_timing.h */
-static const char* PMEStageNames[] = {
-    "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
-    "PME solve",  "PME 3D-FFT c2r", "PME gather",
+static const char* enumValuetoString(PmeStage enumValue)
+{
+    constexpr gmx::EnumerationArray<PmeStage, const char*> pmeStageNames = {
+        "PME spline", "PME spread",     "PME spline + spread", "PME 3D-FFT r2c",
+        "PME solve",  "PME 3D-FFT c2r", "PME gather"
+    };
+    return pmeStageNames[enumValue];
 };
 
-gmx_bool wallcycle_have_counter()
+bool wallcycle_have_counter()
 {
     return gmx_cycles_have_counter();
 }
 
-gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, t_commrec gmx_unused* cr)
+std::unique_ptr<gmx_wallcycle> wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
 {
-    gmx_wallcycle_t wc;
+    std::unique_ptr<gmx_wallcycle> wc;
 
 
     if (!wallcycle_have_counter())
     {
-        return nullptr;
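+        // wc is still null here, so the caller receives an empty handle when no cycle counter is available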
+        return wc;
     }
 
-    snew(wc, 1);
+    wc = std::make_unique<gmx_wallcycle>();
 
-    wc->haveInvalidCount = FALSE;
-    wc->wc_barrier       = FALSE;
-    wc->wcc_all          = nullptr;
+    wc->haveInvalidCount = false;
+    wc->wc_barrier       = false;
     wc->wc_depth         = 0;
-    wc->ewc_prev         = -1;
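+    // WallCycleCounter::Count serves as the "no previous counter" sentinel (previously -1)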
+    wc->ewc_prev         = WallCycleCounter::Count;
     wc->reset_counters   = resetstep;
+    wc->cr               = cr;
+
 
 #if GMX_MPI
-    if (PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
+    if (cr != nullptr && PAR(cr) && getenv("GMX_CYCLE_BARRIER") != nullptr)
     {
         if (fplog)
         {
             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
         }
-        wc->wc_barrier       = TRUE;
-        wc->mpi_comm_mygroup = cr->mpi_comm_mygroup;
+        wc->wc_barrier = true;
     }
 #endif
 
-    snew(wc->wcc, ewcNR);
     if (getenv("GMX_CYCLE_ALL") != nullptr)
     {
         if (fplog)
         {
             fprintf(fplog, "\nWill time all the code during the run\n\n");
         }
-        snew(wc->wcc_all, ewcNR * ewcNR);
-    }
-
-    if (useCycleSubcounters)
-    {
-        snew(wc->wcsc, ewcsNR);
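+        // wcc_all holds a full counter-by-counter matrix, indexed as [previous * sc_numWallCycleCounters + current]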
+        wc->wcc_all.resize(sc_numWallCycleCountersSquared);
     }
 
-#ifdef DEBUG_WCYCLE
-    wc->count_depth = 0;
+#if DEBUG_WCYCLE
+    wc->count_depth  = 0;
+    wc->isMasterRank = MASTER(cr);
 #endif
 
     return wc;
 }
 
-void wallcycle_destroy(gmx_wallcycle_t wc)
+#if DEBUG_WCYCLE
+static void debug_start_check(gmx_wallcycle* wc, WallCycleCounter ewc)
 {
-    if (wc == nullptr)
+    if (wc->count_depth < 0 || wc->count_depth >= c_MaxWallCycleDepth)
     {
-        return;
+        gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
     }
+    wc->counterlist[wc->count_depth] = ewc;
+    wc->count_depth++;
 
-    if (wc->wcc != nullptr)
-    {
-        sfree(wc->wcc);
-    }
-    if (wc->wcc_all != nullptr)
-    {
-        sfree(wc->wcc_all);
-    }
-    if (wc->wcsc != nullptr)
+    if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
     {
-        sfree(wc->wcsc);
+        std::string indentStr(4 * wc->count_depth, ' ');
+        fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
     }
-    sfree(wc);
 }
 
-static void wallcycle_all_start(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
+static void debug_stop_check(gmx_wallcycle* wc, WallCycleCounter ewc)
 {
-    wc->ewc_prev   = ewc;
-    wc->cycle_prev = cycle;
-}
-
-static void wallcycle_all_stop(gmx_wallcycle_t wc, int ewc, gmx_cycles_t cycle)
-{
-    wc->wcc_all[wc->ewc_prev * ewcNR + ewc].n += 1;
-    wc->wcc_all[wc->ewc_prev * ewcNR + ewc].c += cycle - wc->cycle_prev;
-}
-
-
-#ifdef DEBUG_WCYCLE
-static void debug_start_check(gmx_wallcycle_t wc, int ewc)
-{
-    /* fprintf(stderr,"wcycle_start depth %d, %s\n",wc->count_depth,wcn[ewc]); */
-
-    if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
+    if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
     {
-        gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth);
+        std::string indentStr(4 * wc->count_depth, ' ');
+        fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
     }
-    wc->counterlist[wc->count_depth] = ewc;
-    wc->count_depth++;
-}
 
-static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
-{
     wc->count_depth--;
 
-    /* fprintf(stderr,"wcycle_stop depth %d, %s\n",wc->count_depth,wcn[ewc]); */
-
     if (wc->count_depth < 0)
     {
-        gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc],
+        gmx_fatal(FARGS,
+                  "wallcycle counter depth out of range when stopping %s: %d",
+                  enumValuetoString(ewc),
                   wc->count_depth);
     }
     if (wc->counterlist[wc->count_depth] != ewc)
     {
-        gmx_fatal(FARGS, "wallcycle mismatch at stop, start %s, stop %s",
-                  wcn[wc->counterlist[wc->count_depth]], wcn[ewc]);
+        gmx_fatal(FARGS,
+                  "wallcycle mismatch at stop, start %s, stop %s",
+                  enumValuetoString(wc->counterlist[wc->count_depth]),
+                  enumValuetoString(ewc));
     }
 }
 #endif
 
-void wallcycle_start(gmx_wallcycle_t wc, int ewc)
-{
-    gmx_cycles_t cycle;
-
-    if (wc == nullptr)
-    {
-        return;
-    }
-
-#if GMX_MPI
-    if (wc->wc_barrier)
-    {
-        MPI_Barrier(wc->mpi_comm_mygroup);
-    }
-#endif
-
-#ifdef DEBUG_WCYCLE
-    debug_start_check(wc, ewc);
-#endif
-
-    cycle              = gmx_cycles_read();
-    wc->wcc[ewc].start = cycle;
-    if (wc->wcc_all != nullptr)
-    {
-        wc->wc_depth++;
-        if (ewc == ewcRUN)
-        {
-            wallcycle_all_start(wc, ewc, cycle);
-        }
-        else if (wc->wc_depth == 3)
-        {
-            wallcycle_all_stop(wc, ewc, cycle);
-        }
-    }
-}
-
-void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
+void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c)
 {
-    if (wc == nullptr)
-    {
-        return;
-    }
-    wc->wcc[ewc].n++;
-}
-
-void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
-{
-    if (wc == nullptr)
-    {
-        return;
-    }
-
-    wallcycle_start(wc, ewc);
-    wc->wcc[ewc].n--;
+    *n = wc->wcc[ewc].n;
+    *c = static_cast<double>(wc->wcc[ewc].c);
 }
 
-double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
+void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c)
 {
-    gmx_cycles_t cycle, last;
-
-    if (wc == nullptr)
-    {
-        return 0;
-    }
-
-#if GMX_MPI
-    if (wc->wc_barrier)
-    {
-        MPI_Barrier(wc->mpi_comm_mygroup);
-    }
-#endif
-
-#ifdef DEBUG_WCYCLE
-    debug_stop_check(wc, ewc);
-#endif
-
-    /* When processes or threads migrate between cores, the cycle counting
-     * can get messed up if the cycle counter on different cores are not
-     * synchronized. When this happens we expect both large negative and
-     * positive cycle differences. We can detect negative cycle differences.
-     * Detecting too large positive counts if difficult, since count can be
-     * large, especially for ewcRUN. If we detect a negative count,
-     * we will not print the cycle accounting table.
-     */
-    cycle = gmx_cycles_read();
-    if (cycle >= wc->wcc[ewc].start)
+    if (sc_useCycleSubcounters && wc != nullptr)
     {
-        last = cycle - wc->wcc[ewc].start;
+        *n = wc->wcsc[ewcs].n;
+        *c = static_cast<double>(wc->wcsc[ewcs].c);
     }
-    else
-    {
-        last                 = 0;
-        wc->haveInvalidCount = TRUE;
-    }
-    wc->wcc[ewc].c += last;
-    wc->wcc[ewc].n++;
-    if (wc->wcc_all)
-    {
-        wc->wc_depth--;
-        if (ewc == ewcRUN)
-        {
-            wallcycle_all_stop(wc, ewc, cycle);
-        }
-        else if (wc->wc_depth == 2)
-        {
-            wallcycle_all_start(wc, ewc, cycle);
-        }
-    }
-
-    return last;
 }
 
-void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
+void wallcycle_reset_all(gmx_wallcycle* wc)
 {
-    *n = wc->wcc[ewc].n;
-    *c = static_cast<double>(wc->wcc[ewc].c);
-}
-
-void wallcycle_reset_all(gmx_wallcycle_t wc)
-{
-    int i;
-
     if (wc == nullptr)
     {
         return;
     }
 
-    for (i = 0; i < ewcNR; i++)
+    for (auto& counter : wc->wcc)
     {
-        wc->wcc[i].n = 0;
-        wc->wcc[i].c = 0;
+        counter.n = 0;
+        counter.c = 0;
     }
-    wc->haveInvalidCount = FALSE;
+    wc->haveInvalidCount = false;
 
-    if (wc->wcc_all)
+    if (!wc->wcc_all.empty())
     {
-        for (i = 0; i < ewcNR * ewcNR; i++)
+        for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
         {
             wc->wcc_all[i].n = 0;
             wc->wcc_all[i].c = 0;
         }
     }
-    if (wc->wcsc)
+    for (auto& counter : wc->wcsc)
     {
-        for (i = 0; i < ewcsNR; i++)
-        {
-            wc->wcsc[i].n = 0;
-            wc->wcsc[i].c = 0;
-        }
+        counter.n = 0;
+        counter.c = 0;
     }
 }
 
-static gmx_bool is_pme_counter(int ewc)
+static bool is_pme_counter(WallCycleCounter ewc)
 {
-    return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
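+    // This relies on the PME counters being declared contiguously in WallCycleCounter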
+    return (ewc >= WallCycleCounter::PmeMesh && ewc <= WallCycleCounter::PmeWaitComm);
 }
 
-static gmx_bool is_pme_subcounter(int ewc)
+static bool is_pme_subcounter(WallCycleCounter ewc)
 {
-    return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
+    return (ewc >= WallCycleCounter::PmeRedistXF && ewc < WallCycleCounter::PmeWaitComm);
+}
+
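+//! MPI barrier over the group communicator, used before each cycle start/stop when GMX_CYCLE_BARRIER is active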
+void wallcycleBarrier(gmx_wallcycle* wc)
+{
+#if GMX_MPI
+    if (wc->wc_barrier)
+    {
+        MPI_Barrier(wc->cr->mpi_comm_mygroup);
+    }
+#else
+    GMX_UNUSED_VALUE(wc);
+#endif
 }
 
 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
-static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
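+// The counter array is taken by non-const reference because the entry for ewc_main is modified in place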
+// NOLINTNEXTLINE(google-runtime-references)
+static void subtract_cycles(gmx::EnumerationArray<WallCycleCounter, wallcc_t>& wcc,
+                            WallCycleCounter                                   ewc_main,
+                            WallCycleCounter                                   ewc_sub)
 {
     if (wcc[ewc_sub].n > 0)
     {
@@ -492,45 +361,47 @@ static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
     }
 }
 
-void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
+void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 {
     if (wc == nullptr)
     {
         return;
     }
 
-    for (int i = 0; i < ewcNR; i++)
+    for (auto key : keysOf(wc->wcc))
     {
-        if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
+        if (is_pme_counter(key) || (key == WallCycleCounter::Run && isPmeRank))
         {
-            wc->wcc[i].c *= nthreads_pme;
+            wc->wcc[key].c *= nthreads_pme;
 
-            if (wc->wcc_all)
+            if (!wc->wcc_all.empty())
             {
-                for (int j = 0; j < ewcNR; j++)
+                const int current = static_cast<int>(key);
+                for (int j = 0; j < sc_numWallCycleCounters; j++)
                 {
-                    wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
+                    wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pme;
                 }
             }
         }
         else
         {
-            wc->wcc[i].c *= nthreads_pp;
+            wc->wcc[key].c *= nthreads_pp;
 
-            if (wc->wcc_all)
+            if (!wc->wcc_all.empty())
             {
-                for (int j = 0; j < ewcNR; j++)
+                const int current = static_cast<int>(key);
+                for (int j = 0; j < sc_numWallCycleCounters; j++)
                 {
-                    wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
+                    wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pp;
                 }
             }
         }
     }
-    if (useCycleSubcounters && wc->wcsc && !isPmeRank)
+    if (sc_useCycleSubcounters && !isPmeRank)
     {
-        for (int i = 0; i < ewcsNR; i++)
+        for (auto& counter : wc->wcsc)
         {
-            wc->wcsc[i].c *= nthreads_pp;
+            counter.c *= nthreads_pp;
         }
     }
 }
@@ -547,124 +418,146 @@ void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthr
  * wcc_all are unused by the GPU reporting, but it is not satisfactory
  * for the future. Also, there's no need for MPI_Allreduce, since
  * only MASTERRANK uses any of the results. */
-WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
+WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc)
 {
-    WallcycleCounts cycles_sum;
-    wallcc_t*       wcc;
-    double          cycles[int(ewcNR) + int(ewcsNR)];
+    WallcycleCounts                                    cycles_sum;
+    gmx::EnumerationArray<WallCycleCounter, double>    cyclesMain;
+    gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSub;
 #if GMX_MPI
-    double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
+    gmx::EnumerationArray<WallCycleCounter, double>    cyclesMainOnNode;
+    gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSubOnNode;
 #endif
-    int i;
-    int nsum;
 
     if (wc == nullptr)
     {
         /* Default construction of std::array of non-class T can leave
-           the values indeterminate, just like a C array, and icc
-           warns about it. */
+           the values indeterminate, just like a C array */
         cycles_sum.fill(0);
         return cycles_sum;
     }
 
-    wcc = wc->wcc;
+    auto& wcc = wc->wcc;
 
-    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
-    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
+    subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommLoad);
+    subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommBound);
 
-    subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
+    subtract_cycles(wcc, WallCycleCounter::PmeFft, WallCycleCounter::PmeFftComm);
 
     if (cr->npmenodes == 0)
     {
         /* All nodes do PME (or no PME at all) */
-        subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
+        subtract_cycles(wcc, WallCycleCounter::Force, WallCycleCounter::PmeMesh);
     }
     else
     {
         /* There are PME-only nodes */
-        if (wcc[ewcPMEMESH].n > 0)
+        if (wcc[WallCycleCounter::PmeMesh].n > 0)
         {
             /* This must be a PME only node, calculate the Wait + Comm. time */
-            GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
+            GMX_ASSERT(wcc[WallCycleCounter::Run].c >= wcc[WallCycleCounter::PmeMesh].c,
                        "Total run ticks must be greater than PME-only ticks");
-            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
+            wcc[WallCycleCounter::PmeWaitComm].c =
+                    wcc[WallCycleCounter::Run].c - wcc[WallCycleCounter::PmeMesh].c;
         }
     }
 
     /* Store the cycles in a double buffer for summing */
-    for (i = 0; i < ewcNR; i++)
+    for (auto key : keysOf(wcc))
     {
 #if GMX_MPI
-        cycles_n[i] = static_cast<double>(wcc[i].n);
+        cyclesMainOnNode[key] = static_cast<double>(wcc[key].n);
 #endif
-        cycles[i] = static_cast<double>(wcc[i].c);
+        cyclesMain[key] = static_cast<double>(wcc[key].c);
     }
-    nsum = ewcNR;
-    if (wc->wcsc)
+    if (sc_useCycleSubcounters)
     {
-        for (i = 0; i < ewcsNR; i++)
+        for (auto key : keysOf(wc->wcsc))
         {
 #if GMX_MPI
-            cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
+            cyclesSubOnNode[key] = static_cast<double>(wc->wcsc[key].n);
 #endif
-            cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
+            cyclesSub[key] = static_cast<double>(wc->wcsc[key].c);
         }
-        nsum += ewcsNR;
     }
 
 #if GMX_MPI
     if (cr->nnodes > 1)
     {
-        double buf[int(ewcNR) + int(ewcsNR) + 1];
+        gmx::EnumerationArray<WallCycleCounter, double>    bufMain;
+        gmx::EnumerationArray<WallCycleSubCounter, double> bufSub;
 
         // TODO this code is used only at the end of the run, so we
         // can just do a simple reduce of haveInvalidCount in
         // wallcycle_print, and avoid bugs
-        cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
+        double haveInvalidCount = (wc->haveInvalidCount ? 1 : 0);
         // TODO Use MPI_Reduce
-        MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
-        for (i = 0; i < ewcNR; i++)
+        MPI_Allreduce(cyclesMainOnNode.data(), bufMain.data(), bufMain.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+        if (sc_useCycleSubcounters)
+        {
+            MPI_Allreduce(cyclesSubOnNode.data(), bufSub.data(), bufSub.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+        }
+        MPI_Allreduce(MPI_IN_PLACE, &haveInvalidCount, 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+        for (auto key : keysOf(wcc))
         {
-            wcc[i].n = gmx::roundToInt(buf[i]);
+            wcc[key].n = gmx::roundToInt(bufMain[key]);
         }
-        wc->haveInvalidCount = (buf[nsum] > 0);
-        if (wc->wcsc)
+        wc->haveInvalidCount = (haveInvalidCount > 0);
+        if (sc_useCycleSubcounters)
         {
-            for (i = 0; i < ewcsNR; i++)
+            for (auto key : keysOf(wc->wcsc))
             {
-                wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
+                wc->wcsc[key].n = gmx::roundToInt(bufSub[key]);
             }
         }
 
         // TODO Use MPI_Reduce
-        MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+        MPI_Allreduce(cyclesMain.data(), cycles_sum.data(), cyclesMain.size(), MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+        if (sc_useCycleSubcounters)
+        {
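+            // The sub-counter sums are appended after the main-counter sums in cycles_sum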
+            MPI_Allreduce(cyclesSub.data(),
+                          cycles_sum.data() + sc_numWallCycleCounters,
+                          cyclesSub.size(),
+                          MPI_DOUBLE,
+                          MPI_SUM,
+                          cr->mpi_comm_mysim);
+        }
 
-        if (wc->wcc_all != nullptr)
+        if (!wc->wcc_all.empty())
         {
-            double *buf_all, *cyc_all;
+            std::array<double, sc_numWallCycleCountersSquared> cyc_all;
+            std::array<double, sc_numWallCycleCountersSquared> buf_all;
 
-            snew(cyc_all, ewcNR * ewcNR);
-            snew(buf_all, ewcNR * ewcNR);
-            for (i = 0; i < ewcNR * ewcNR; i++)
+            for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
             {
                 cyc_all[i] = wc->wcc_all[i].c;
             }
             // TODO Use MPI_Reduce
-            MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
-            for (i = 0; i < ewcNR * ewcNR; i++)
+            MPI_Allreduce(cyc_all.data(),
+                          buf_all.data(),
+                          sc_numWallCycleCountersSquared,
+                          MPI_DOUBLE,
+                          MPI_SUM,
+                          cr->mpi_comm_mysim);
+            for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
             {
                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
             }
-            sfree(buf_all);
-            sfree(cyc_all);
         }
     }
     else
 #endif
     {
-        for (i = 0; i < nsum; i++)
+        for (auto key : keysOf(cyclesMain))
         {
-            cycles_sum[i] = cycles[i];
+            cycles_sum[static_cast<int>(key)] = cyclesMain[key];
+        }
+        if (sc_useCycleSubcounters)
+        {
+            for (auto key : keysOf(cyclesSub))
+            {
+                const int offset   = static_cast<int>(key) + sc_numWallCycleCounters;
+                cycles_sum[offset] = cyclesSub[key];
+            }
         }
     }
 
@@ -711,8 +604,15 @@ print_cycles(FILE* fplog, double c2t, const char* name, int nnodes, int nthreads
         /* Convert the cycle count to wallclock time for this task */
         wallt = c_sum * c2t;
 
-        fprintf(fplog, " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n", name, nnodes_str,
-                nthreads_str, ncalls_str, wallt, c_sum * 1e-9, percentage);
+        fprintf(fplog,
+                " %-19.19s %4s %4s %10s  %10.3f %14.3f %5.1f\n",
+                name,
+                nnodes_str,
+                nthreads_str,
+                ncalls_str,
+                wallt,
+                c_sum * 1e-9,
+                percentage);
     }
 }
 
@@ -781,14 +681,14 @@ void wallcycle_print(FILE*                            fplog,
                      int                              nth_pp,
                      int                              nth_pme,
                      double                           realtime,
-                     gmx_wallcycle_t                  wc,
+                     gmx_wallcycle*                   wc,
                      const WallcycleCounts&           cyc_sum,
                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 {
     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
     double      c2t, c2t_pp, c2t_pme = 0;
-    int         i, j, npp, nth_tot;
+    int         npp, nth_tot;
     char        buf[STRLEN];
     const char* hline =
             "-----------------------------------------------------------------------------";
@@ -811,7 +711,7 @@ void wallcycle_print(FILE*                            fplog,
     /* When using PME-only nodes, the next line is valid for both
        PP-only and PME-only nodes because they started ewcRUN at the
        same time. */
-    tot        = cyc_sum[ewcRUN];
+    tot        = cyc_sum[static_cast<int>(WallCycleCounter::Run)];
     tot_for_pp = 0;
 
     if (tot <= 0.0)
@@ -820,8 +720,8 @@ void wallcycle_print(FILE*                            fplog,
            code so that it is provably robust with respect to
            non-positive values for all possible timer and cycle
            counters, there is less value gained from printing whatever
-           timing data might still be sensible for some non-Jenkins
-           run, than is lost from diagnosing Jenkins FP exceptions on
+           timing data might still be sensible for some non-CI
+           run, than is lost from diagnosing CI FP exceptions on
            runs about whose execution time we don't care. */
         GMX_LOG(mdlog.warning)
                 .asParagraph()
@@ -861,39 +761,64 @@ void wallcycle_print(FILE*                            fplog,
     print_header(fplog, npp, nth_pp, npme, nth_pme);
 
     fprintf(fplog, "%s\n", hline);
-    for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
+    gmx::EnumerationWrapper<WallCycleCounter> iter;
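+    // Iterate from Domdec, the first counter after PpDuringPme, mirroring the old ewcPPDURINGPME + 1 loop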
+    for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
+         key != iter.end();
+         ++key)
     {
-        if (is_pme_subcounter(i))
+
+        if (is_pme_subcounter(*key))
         {
             /* Do not count these at all */
         }
-        else if (npme > 0 && is_pme_counter(i))
+        else if (npme > 0 && is_pme_counter(*key))
         {
             /* Print timing information for PME-only nodes, but add an
              * asterisk so the reader of the table can know that the
              * walltimes are not meant to add up. The asterisk still
              * fits in the required maximum of 19 characters. */
-            char buffer[STRLEN];
-            snprintf(buffer, STRLEN, "%s *", wcn[i]);
-            print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
+            std::string message = gmx::formatString("%s *", enumValuetoString(*key));
+            print_cycles(fplog,
+                         c2t_pme,
+                         message.c_str(),
+                         npme,
+                         nth_pme,
+                         wc->wcc[*key].n,
+                         cyc_sum[static_cast<int>(*key)],
+                         tot);
         }
         else
         {
             /* Print timing information when it is for a PP or PP+PME
                node */
-            print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
-            tot_for_pp += cyc_sum[i];
+            print_cycles(fplog,
+                         c2t_pp,
+                         enumValuetoString(*key),
+                         npp,
+                         nth_pp,
+                         wc->wcc[*key].n,
+                         cyc_sum[static_cast<int>(*key)],
+                         tot);
+            tot_for_pp += cyc_sum[static_cast<int>(*key)];
         }
     }
-    if (wc->wcc_all != nullptr)
+    if (!wc->wcc_all.empty())
     {
-        for (i = 0; i < ewcNR; i++)
+        for (auto i : keysOf(wc->wcc))
         {
-            for (j = 0; j < ewcNR; j++)
+            const int countI = static_cast<int>(i);
+            for (auto j : keysOf(wc->wcc))
             {
-                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
-                print_cycles(fplog, c2t_pp, buf, npp, nth_pp, wc->wcc_all[i * ewcNR + j].n,
-                             wc->wcc_all[i * ewcNR + j].c, tot);
+                const int countJ = static_cast<int>(j);
+                snprintf(buf, 20, "%-9.9s %-9.9s", enumValuetoString(i), enumValuetoString(j));
+                print_cycles(fplog,
+                             c2t_pp,
+                             buf,
+                             npp,
+                             nth_pp,
+                             wc->wcc_all[countI * sc_numWallCycleCounters + countJ].n,
+                             wc->wcc_all[countI * sc_numWallCycleCounters + countJ].c,
+                             tot);
             }
         }
     }
@@ -912,16 +837,18 @@ void wallcycle_print(FILE*                            fplog,
                 hline);
     }
 
-    if (wc->wcc[ewcPMEMESH].n > 0)
+    if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
     {
         // A workaround to not print breakdown when no subcounters were recorded.
         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
-        std::vector<int> validPmeSubcounterIndices;
-        for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
+        std::vector<WallCycleCounter> validPmeSubcounterIndices;
+        for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
+             key != iter.end();
+             key++)
         {
-            if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
+            if (is_pme_subcounter(*key) && wc->wcc[*key].n > 0)
             {
-                validPmeSubcounterIndices.push_back(i);
+                validPmeSubcounterIndices.push_back(*key);
             }
         }
 
@@ -931,20 +858,33 @@ void wallcycle_print(FILE*                            fplog,
             fprintf(fplog, "%s\n", hline);
             for (auto i : validPmeSubcounterIndices)
             {
-                print_cycles(fplog, npme > 0 ? c2t_pme : c2t_pp, wcn[i], npme > 0 ? npme : npp,
-                             nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
+                print_cycles(fplog,
+                             npme > 0 ? c2t_pme : c2t_pp,
+                             enumValuetoString(i),
+                             npme > 0 ? npme : npp,
+                             nth_pme,
+                             wc->wcc[i].n,
+                             cyc_sum[static_cast<int>(i)],
+                             tot);
             }
             fprintf(fplog, "%s\n", hline);
         }
     }
 
-    if (useCycleSubcounters && wc->wcsc)
+    if (sc_useCycleSubcounters)
     {
         fprintf(fplog, " Breakdown of PP computation\n");
         fprintf(fplog, "%s\n", hline);
-        for (i = 0; i < ewcsNR; i++)
+        for (auto key : keysOf(wc->wcsc))
         {
-            print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
+            print_cycles(fplog,
+                         c2t_pp,
+                         enumValuetoString(key),
+                         npp,
+                         nth_pp,
+                         wc->wcsc[key].n,
+                         cyc_sum[sc_numWallCycleCounters + static_cast<int>(key)],
+                         tot);
         }
         fprintf(fplog, "%s\n", hline);
     }
@@ -953,9 +893,9 @@ void wallcycle_print(FILE*                            fplog,
     double tot_gpu = 0.0;
     if (gpu_pme_t)
     {
-        for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
+        for (auto key : keysOf(gpu_pme_t->timing))
         {
-            tot_gpu += gpu_pme_t->timing[k].t;
+            tot_gpu += gpu_pme_t->timing[key].t;
         }
     }
     if (gpu_nbnxn_t)
@@ -965,55 +905,61 @@ void wallcycle_print(FILE*                            fplog,
         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 
         /* add up the kernel timings */
-        for (i = 0; i < 2; i++)
+        for (int i = 0; i < 2; i++)
         {
-            for (j = 0; j < 2; j++)
+            for (int j = 0; j < 2; j++)
             {
                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
             }
         }
         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 
-        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
-        if (wc->wcc[ewcPMEMESH].n > 0)
+        tot_cpu_overlap = wc->wcc[WallCycleCounter::Force].c;
+        if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
         {
-            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
+            tot_cpu_overlap += wc->wcc[WallCycleCounter::PmeMesh].c;
         }
         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 
         fprintf(fplog, "\n GPU timings\n%s\n", hline);
         fprintf(fplog,
-                " Computing:                         Count  Wall t (s)      ms/step       %c\n", '%');
+                " Computing:                         Count  Wall t (s)      ms/step       %c\n",
+                '%');
         fprintf(fplog, "%s\n", hline);
         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 
-        for (i = 0; i < 2; i++)
+        for (int i = 0; i < 2; i++)
         {
-            for (j = 0; j < 2; j++)
+            for (int j = 0; j < 2; j++)
             {
                 if (gpu_nbnxn_t->ktime[i][j].c)
                 {
-                    print_gputimes(fplog, k_log_str[i][j], gpu_nbnxn_t->ktime[i][j].c,
-                                   gpu_nbnxn_t->ktime[i][j].t, tot_gpu);
+                    print_gputimes(fplog,
+                                   k_log_str[i][j],
+                                   gpu_nbnxn_t->ktime[i][j].c,
+                                   gpu_nbnxn_t->ktime[i][j].t,
+                                   tot_gpu);
                 }
             }
         }
         if (gpu_pme_t)
         {
-            for (size_t k = 0; k < gtPME_EVENT_COUNT; k++)
+            for (auto key : keysOf(gpu_pme_t->timing))
             {
-                if (gpu_pme_t->timing[k].c)
+                if (gpu_pme_t->timing[key].c)
                 {
-                    print_gputimes(fplog, PMEStageNames[k], gpu_pme_t->timing[k].c,
-                                   gpu_pme_t->timing[k].t, tot_gpu);
+                    print_gputimes(fplog,
+                                   enumValuetoString(key),
+                                   gpu_pme_t->timing[key].c,
+                                   gpu_pme_t->timing[key].t,
+                                   tot_gpu);
                 }
             }
         }
         if (gpu_nbnxn_t->pruneTime.c)
         {
-            print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c,
-                           gpu_nbnxn_t->pruneTime.t, tot_gpu);
+            print_gputimes(fplog, "Pruning kernel", gpu_nbnxn_t->pruneTime.c, gpu_nbnxn_t->pruneTime.t, tot_gpu);
         }
         print_gputimes(fplog, "F D2H", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_d2h_t, tot_gpu);
         fprintf(fplog, "%s\n", hline);
@@ -1025,21 +971,26 @@ void wallcycle_print(FILE*                            fplog,
              * and avoid adding it to tot_gpu as this is not in the force
              * overlap. We print the fraction as relative to the rest.
              */
-            print_gputimes(fplog, "*Dynamic pruning", gpu_nbnxn_t->dynamicPruneTime.c,
-                           gpu_nbnxn_t->dynamicPruneTime.t, tot_gpu);
+            print_gputimes(fplog,
+                           "*Dynamic pruning",
+                           gpu_nbnxn_t->dynamicPruneTime.c,
+                           gpu_nbnxn_t->dynamicPruneTime.t,
+                           tot_gpu);
             fprintf(fplog, "%s\n", hline);
         }
         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
-        if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
+        if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[WallCycleCounter::Force].n > 0)
         {
             fprintf(fplog,
                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
                     "%.3f\n",
-                    tot_gpu / gpu_nbnxn_t->nb_c, tot_cpu_overlap / wc->wcc[ewcFORCE].n, gpu_cpu_ratio);
+                    tot_gpu / gpu_nbnxn_t->nb_c,
+                    tot_cpu_overlap / wc->wcc[WallCycleCounter::Force].n,
+                    gpu_cpu_ratio);
         }
 
         /* only print notes related to CPU-GPU load balance with PME */
-        if (wc->wcc[ewcPMEMESH].n > 0)
+        if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
         {
             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
 
@@ -1100,10 +1051,12 @@ void wallcycle_print(FILE*                            fplog,
                         "call, so timings are not those of real runs.");
     }
 
-    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
+    if (wc->wcc[WallCycleCounter::NbXFBufOps].n > 0
+        && (cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] > tot * 0.1
+            || cyc_sum[static_cast<int>(WallCycleCounter::NS)] > tot * 0.1))
     {
         /* Only the sim master calls this function, so always print to stderr */
-        if (wc->wcc[ewcDOMDEC].n == 0)
+        if (wc->wcc[WallCycleCounter::Domdec].n == 0)
         {
             GMX_LOG(mdlog.warning)
                     .asParagraph()
@@ -1111,7 +1064,7 @@ void wallcycle_print(FILE*                            fplog,
                             "NOTE: %d %% of the run time was spent in pair search,\n"
                             "      you might want to increase nstlist (this has no effect on "
                             "accuracy)\n",
-                            gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
+                            gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
         }
         else
         {
@@ -1122,64 +1075,36 @@ void wallcycle_print(FILE*                            fplog,
                             "      %d %% of the run time was spent in pair search,\n"
                             "      you might want to increase nstlist (this has no effect on "
                             "accuracy)\n",
-                            gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
-                            gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
+                            gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] / tot),
+                            gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
         }
     }
 
-    if (cyc_sum[ewcMoveE] > tot * 0.05)
+    if (cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] > tot * 0.05)
     {
         GMX_LOG(mdlog.warning)
                 .asParagraph()
                 .appendTextFormatted(
                         "NOTE: %d %% of the run time was spent communicating energies,\n"
                         "      you might want to increase some nst* mdp options\n",
-                        gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
+                        gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] / tot));
     }
 }
 
-extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
+int64_t wcycle_get_reset_counters(gmx_wallcycle* wc)
 {
     if (wc == nullptr)
     {
         return -1;
     }
-
     return wc->reset_counters;
 }
 
-extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
+void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters)
 {
     if (wc == nullptr)
     {
         return;
     }
-
     wc->reset_counters = reset_counters;
 }
-
-void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
-{
-    if (useCycleSubcounters && wc != nullptr)
-    {
-        wc->wcsc[ewcs].start = gmx_cycles_read();
-    }
-}
-
-void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
-{
-    if (useCycleSubcounters && wc != nullptr)
-    {
-        wallcycle_sub_start(wc, ewcs);
-        wc->wcsc[ewcs].n--;
-    }
-}
-
-void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
-{
-    if (useCycleSubcounters && wc != nullptr)
-    {
-        wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
-        wc->wcsc[ewcs].n++;
-    }
-}