From 2fcd971786e3820dbd30e4dbc6e25ca546e4f27f Mon Sep 17 00:00:00 2001
From: Paul Bauer <paul.bauer.q@gmail.com>
Date: Wed, 14 Apr 2021 13:52:51 +0000
Subject: [PATCH] Modernize wallcycle counting

---
 api/nblib/gmxsetup.cpp                        |   2 +-
 src/gromacs/applied_forces/awh/awh.cpp        |   4 +-
 src/gromacs/domdec/cellsizes.cpp              |   8 +-
 src/gromacs/domdec/cellsizes.h                |   4 +-
 src/gromacs/domdec/domdec.cpp                 |   8 +-
 src/gromacs/domdec/gpuhaloexchange_impl.cu    |  32 +-
 src/gromacs/domdec/partition.cpp              |  36 +-
 src/gromacs/ewald/pme.cpp                     |  56 +-
 src/gromacs/ewald/pme_gpu.cpp                 |  78 +--
 src/gromacs/ewald/pme_load_balancing.cpp      |   4 +-
 src/gromacs/ewald/pme_load_balancing.h        |   6 +-
 src/gromacs/ewald/pme_only.cpp                |  12 +-
 src/gromacs/ewald/pme_pp.cpp                  |   4 +-
 src/gromacs/fft/fft5d.cpp                     |   4 +-
 src/gromacs/fft/fft5d.h                       |   4 +-
 src/gromacs/fft/parallel_3dfft.cpp            |   4 +-
 src/gromacs/fft/parallel_3dfft.h              |   4 +-
 src/gromacs/imd/imd.cpp                       |  12 +-
 src/gromacs/listed_forces/gpubonded_impl.cu   |  16 +-
 src/gromacs/listed_forces/gpubondedkernels.cu |   8 +-
 src/gromacs/listed_forces/listed_forces.cpp   |  16 +-
 .../listed_forces/position_restraints.cpp     |   4 +-
 src/gromacs/mdlib/constr.cpp                  |   4 +-
 src/gromacs/mdlib/force.cpp                   |  10 +-
 src/gromacs/mdlib/gpuforcereduction_impl.cu   |  12 +-
 src/gromacs/mdlib/md_support.cpp              |   6 +-
 src/gromacs/mdlib/md_support.h                |   2 +-
 src/gromacs/mdlib/mdoutf.cpp                  |  10 +-
 src/gromacs/mdlib/mdoutf.h                    |   4 +-
 src/gromacs/mdlib/resethandler.cpp            |  10 +-
 src/gromacs/mdlib/sim_util.cpp                | 156 ++---
 src/gromacs/mdlib/trajectory_writing.cpp      |   4 +-
 src/gromacs/mdlib/update.cpp                  |  20 +-
 src/gromacs/mdlib/update.h                    |   4 +-
 .../mdlib/update_constrain_gpu_impl.cu        |  32 +-
 src/gromacs/mdlib/update_vv.cpp               |  22 +-
 src/gromacs/mdlib/vsite.cpp                   |   4 +-
 src/gromacs/mdrun/md.cpp                      |  18 +-
 src/gromacs/mdrun/mimic.cpp                   |  10 +-
 src/gromacs/mdrun/minimize.cpp                |  18 +-
 src/gromacs/mdrun/rerun.cpp                   |   6 +-
 src/gromacs/mdrun/runner.cpp                  |  34 +-
 src/gromacs/mdrun/shellfc.cpp                 |   2 +-
 src/gromacs/mdrun/shellfc.h                   |   2 +-
 src/gromacs/mdrun/tpi.cpp                     |   2 +-
 .../state_propagator_data_gpu_impl_gpu.cpp    |  80 +--
 src/gromacs/modularsimulator/propagator.cpp   |  16 +-
 .../modularsimulator/simulatoralgorithm.cpp   |   6 +-
 .../modularsimulator/statepropagatordata.cpp  |  10 +-
 src/gromacs/nbnxm/gpu_common.h                |  10 +-
 src/gromacs/nbnxm/kerneldispatch.cpp          |  12 +-
 src/gromacs/nbnxm/nbnxm.cpp                   |  24 +-
 src/gromacs/nbnxm/prunekerneldispatch.cpp     |   8 +-
 src/gromacs/swap/swapcoords.cpp               |   4 +-
 src/gromacs/timing/tests/timing.cpp           |  44 +-
 src/gromacs/timing/wallcycle.cpp              | 551 ++++++++++--------
 src/gromacs/timing/wallcycle.h                | 296 +++++-----
 src/gromacs/timing/wallcyclereporting.h       |  10 +-
 58 files changed, 919 insertions(+), 870 deletions(-)

diff --git a/api/nblib/gmxsetup.cpp b/api/nblib/gmxsetup.cpp
index fd3cfd2a87..007e0cf3db 100644
--- a/api/nblib/gmxsetup.cpp
+++ b/api/nblib/gmxsetup.cpp
@@ -206,7 +206,7 @@ void NbvSetupUtil::setupNbnxmInstance(const size_t numParticleTypes, const NBKer
 
     // Put everything together
     auto nbv = std::make_unique<nonbonded_verlet_t>(
-            std::move(pairlistSets), std::move(pairSearch), std::move(atomData), kernelSetup, nullptr, nullWallcycle);
+            std::move(pairlistSets), std::move(pairSearch), std::move(atomData), kernelSetup, nullptr, nullptr);
 
     gmxForceCalculator_->nbv_ = std::move(nbv);
 }
diff --git a/src/gromacs/applied_forces/awh/awh.cpp b/src/gromacs/applied_forces/awh/awh.cpp
index 51948de02b..6f2a8db5ea 100644
--- a/src/gromacs/applied_forces/awh/awh.cpp
+++ b/src/gromacs/applied_forces/awh/awh.cpp
@@ -304,7 +304,7 @@ real Awh::applyBiasForcesAndUpdateBias(PbcType                pbcType,
         GMX_ASSERT(forceWithVirial, "Need a valid ForceWithVirial object");
     }
 
-    wallcycle_start(wallcycle, ewcAWH);
+    wallcycle_start(wallcycle, WallCycleCounter::Awh);
 
     t_pbc pbc;
     set_pbc(&pbc, pbcType, box);
@@ -394,7 +394,7 @@ real Awh::applyBiasForcesAndUpdateBias(PbcType                pbcType,
         }
     }
 
-    wallcycle_stop(wallcycle, ewcAWH);
+    wallcycle_stop(wallcycle, WallCycleCounter::Awh);
 
     return MASTER(commRecord_) ? static_cast<real>(awhPotential) : 0;
 }
diff --git a/src/gromacs/domdec/cellsizes.cpp b/src/gromacs/domdec/cellsizes.cpp
index 37813d8aba..ed1e6dbbcd 100644
--- a/src/gromacs/domdec/cellsizes.cpp
+++ b/src/gromacs/domdec/cellsizes.cpp
@@ -848,15 +848,15 @@ static void set_dd_cell_sizes_dlb(gmx_domdec_t*      dd,
                                   gmx_bool           bUniform,
                                   gmx_bool           bDoDLB,
                                   int64_t            step,
-                                  gmx_wallcycle_t    wcycle)
+                                  gmx_wallcycle*     wcycle)
 {
     gmx_domdec_comm_t* comm = dd->comm;
 
     if (bDoDLB)
     {
-        wallcycle_start(wcycle, ewcDDCOMMBOUND);
+        wallcycle_start(wcycle, WallCycleCounter::DDCommBound);
         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
-        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
+        wallcycle_stop(wcycle, WallCycleCounter::DDCommBound);
     }
     else if (bDynamicBox)
     {
@@ -885,7 +885,7 @@ void set_dd_cell_sizes(gmx_domdec_t*      dd,
                        gmx_bool           bUniform,
                        gmx_bool           bDoDLB,
                        int64_t            step,
-                       gmx_wallcycle_t    wcycle)
+                       gmx_wallcycle*     wcycle)
 {
     gmx_domdec_comm_t* comm = dd->comm;
 
diff --git a/src/gromacs/domdec/cellsizes.h b/src/gromacs/domdec/cellsizes.h
index cf07585e89..794eb27457 100644
--- a/src/gromacs/domdec/cellsizes.h
+++ b/src/gromacs/domdec/cellsizes.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -84,6 +84,6 @@ void set_dd_cell_sizes(gmx_domdec_t*      dd,
                        gmx_bool           bUniform,
                        gmx_bool           bDoDLB,
                        int64_t            step,
-                       gmx_wallcycle_t    wcycle);
+                       gmx_wallcycle*     wcycle);
 
 #endif
diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp
index 088189b602..4f7cab7862 100644
--- a/src/gromacs/domdec/domdec.cpp
+++ b/src/gromacs/domdec/domdec.cpp
@@ -258,7 +258,7 @@ void dd_get_constraint_range(const gmx_domdec_t& dd, int* at_start, int* at_end)
 
 void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, gmx_wallcycle* wcycle)
 {
-    wallcycle_start(wcycle, ewcMOVEX);
+    wallcycle_start(wcycle, WallCycleCounter::MoveX);
 
     rvec shift = { 0, 0, 0 };
 
@@ -347,12 +347,12 @@ void dd_move_x(gmx_domdec_t* dd, const matrix box, gmx::ArrayRef<gmx::RVec> x, g
         nzone += nzone;
     }
 
-    wallcycle_stop(wcycle, ewcMOVEX);
+    wallcycle_stop(wcycle, WallCycleCounter::MoveX);
 }
 
 void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces, gmx_wallcycle* wcycle)
 {
-    wallcycle_start(wcycle, ewcMOVEF);
+    wallcycle_start(wcycle, WallCycleCounter::MoveF);
 
     gmx::ArrayRef<gmx::RVec> f      = forceWithShiftForces->force();
     gmx::ArrayRef<gmx::RVec> fshift = forceWithShiftForces->shiftForces();
@@ -456,7 +456,7 @@ void dd_move_f(gmx_domdec_t* dd, gmx::ForceWithShiftForces* forceWithShiftForces
         }
         nzone /= 2;
     }
-    wallcycle_stop(wcycle, ewcMOVEF);
+    wallcycle_stop(wcycle, WallCycleCounter::MoveF);
 }
 
 /* Convenience function for extracting a real buffer from an rvec buffer
diff --git a/src/gromacs/domdec/gpuhaloexchange_impl.cu b/src/gromacs/domdec/gpuhaloexchange_impl.cu
index 65af08d35d..f32a95bae7 100644
--- a/src/gromacs/domdec/gpuhaloexchange_impl.cu
+++ b/src/gromacs/domdec/gpuhaloexchange_impl.cu
@@ -134,8 +134,8 @@ __global__ void unpackRecvBufKernel(float3* __restrict__ data,
 
 void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_forcesBuffer)
 {
-    wallcycle_start(wcycle_, ewcDOMDEC);
-    wallcycle_sub_start(wcycle_, ewcsDD_GPU);
+    wallcycle_start(wcycle_, WallCycleCounter::Domdec);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::DDGpu);
 
     d_x_ = d_coordinatesBuffer;
     d_f_ = d_forcesBuffer;
@@ -249,8 +249,8 @@ void GpuHaloExchange::Impl::reinitHalo(float3* d_coordinatesBuffer, float3* d_fo
     }
 #endif
 
-    wallcycle_sub_stop(wcycle_, ewcsDD_GPU);
-    wallcycle_stop(wcycle_, ewcDOMDEC);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::DDGpu);
+    wallcycle_stop(wcycle_, WallCycleCounter::Domdec);
 
     return;
 }
@@ -283,14 +283,14 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
                                                        GpuEventSynchronizer* coordinatesReadyOnDeviceEvent)
 {
 
-    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu);
     if (pulse_ == 0)
     {
         // ensure stream waits until coordinate data is available on device
         coordinatesReadyOnDeviceEvent->enqueueWaitEvent(nonLocalStream_);
     }
 
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEX);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
 
     // launch kernel to pack send buffer
     KernelLaunchConfig config;
@@ -328,12 +328,12 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
                 kernelFn, config, nonLocalStream_, nullptr, "Domdec GPU Apply X Halo Exchange", kernelArgs);
     }
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEX);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveX);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 
     // Consider time spent in communicateHaloData as Comm.X counter
     // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
-    wallcycle_start(wcycle_, ewcMOVEX);
+    wallcycle_start(wcycle_, WallCycleCounter::MoveX);
 
     // wait for remote co-ordinates is implicit with process-MPI as non-local stream is synchronized before MPI calls
     // and MPI_Waitall call makes sure both neighboring ranks' non-local stream is synchronized before data transfer is initiated
@@ -345,7 +345,7 @@ void GpuHaloExchange::Impl::communicateHaloCoordinates(const matrix          box
     float3* recvPtr = GMX_THREAD_MPI ? remoteXPtr_ : &d_x_[atomOffset_];
     communicateHaloData(d_sendBuf_, xSendSize_, sendRankX_, recvPtr, xRecvSize_, recvRankX_);
 
-    wallcycle_stop(wcycle_, ewcMOVEX);
+    wallcycle_stop(wcycle_, WallCycleCounter::MoveX);
 
     return;
 }
@@ -356,17 +356,17 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
 {
     // Consider time spent in communicateHaloData as Comm.F counter
     // ToDo: We need further refinement here as communicateHaloData includes launch time for cudamemcpyasync
-    wallcycle_start(wcycle_, ewcMOVEF);
+    wallcycle_start(wcycle_, WallCycleCounter::MoveF);
 
     float3* recvPtr = GMX_THREAD_MPI ? remoteFPtr_ : d_recvBuf_;
 
     // Communicate halo data (in non-local stream)
     communicateHaloData(&(d_f_[atomOffset_]), fSendSize_, sendRankF_, recvPtr, fRecvSize_, recvRankF_);
 
-    wallcycle_stop(wcycle_, ewcMOVEF);
+    wallcycle_stop(wcycle_, WallCycleCounter::MoveF);
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_MOVEF);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
 
     float3* d_f = d_f_;
     // If this is the last pulse and index (noting the force halo
@@ -422,8 +422,8 @@ void GpuHaloExchange::Impl::communicateHaloForces(bool accumulateForces)
         fReadyOnDevice_.markEvent(nonLocalStream_);
     }
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_MOVEF);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuMoveF);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 void GpuHaloExchange::Impl::communicateHaloData(float3* sendPtr,
diff --git a/src/gromacs/domdec/partition.cpp b/src/gromacs/domdec/partition.cpp
index c87a3077fd..2aa7c35c97 100644
--- a/src/gromacs/domdec/partition.cpp
+++ b/src/gromacs/domdec/partition.cpp
@@ -754,7 +754,7 @@ static void comm_dd_ns_cell_sizes(gmx_domdec_t* dd, gmx_ddbox_t* ddbox, rvec cel
 }
 
 //! Compute and communicate to determine the load distribution across PP ranks.
-static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle)
+static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle* wcycle)
 {
     gmx_domdec_comm_t* comm;
     domdec_load_t*     load;
@@ -766,7 +766,7 @@ static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle)
         fprintf(debug, "get_load_distribution start\n");
     }
 
-    wallcycle_start(wcycle, ewcDDCOMMLOAD);
+    wallcycle_start(wcycle, WallCycleCounter::DDCommLoad);
 
     comm = dd->comm;
 
@@ -937,7 +937,7 @@ static void get_load_distribution(gmx_domdec_t* dd, gmx_wallcycle_t wcycle)
         }
     }
 
-    wallcycle_stop(wcycle, ewcDDCOMMLOAD);
+    wallcycle_stop(wcycle, WallCycleCounter::DDCommLoad);
 
     if (debug)
     {
@@ -2760,7 +2760,7 @@ void dd_partition_system(FILE*                     fplog,
     int         ncgindex_set;
     char        sbuf[22];
 
-    wallcycle_start(wcycle, ewcDOMDEC);
+    wallcycle_start(wcycle, WallCycleCounter::Domdec);
 
     gmx_domdec_t*      dd   = cr->dd;
     gmx_domdec_comm_t* comm = dd->comm;
@@ -3059,7 +3059,7 @@ void dd_partition_system(FILE*                     fplog,
     int ncg_moved = 0;
     if (bRedist)
     {
-        wallcycle_sub_start(wcycle, ewcsDD_REDIST);
+        wallcycle_sub_start(wcycle, WallCycleSubCounter::DDRedist);
 
         ncgindex_set = dd->ncg_home;
         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir, state_local, fr, nrnb, &ncg_moved);
@@ -3073,7 +3073,7 @@ void dd_partition_system(FILE*                     fplog,
                     state_local->x);
         }
 
-        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDRedist);
     }
 
     RVec cell_ns_x0, cell_ns_x1;
@@ -3095,7 +3095,7 @@ void dd_partition_system(FILE*                     fplog,
 
     if (bSortCG)
     {
-        wallcycle_sub_start(wcycle, ewcsDD_GRID);
+        wallcycle_sub_start(wcycle, WallCycleSubCounter::DDGrid);
 
         /* Sort the state on charge group position.
          * This enables exact restarts from this step.
@@ -3136,7 +3136,7 @@ void dd_partition_system(FILE*                     fplog,
         dd->ga2la->clear();
         ncgindex_set = 0;
 
-        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDGrid);
     }
     else
     {
@@ -3157,7 +3157,7 @@ void dd_partition_system(FILE*                     fplog,
         comm->updateGroupsCog->clear();
     }
 
-    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
+    wallcycle_sub_start(wcycle, WallCycleSubCounter::DDSetupComm);
 
     /* Set the induces for the home atoms */
     set_zones_ncg_home(dd);
@@ -3175,14 +3175,14 @@ void dd_partition_system(FILE*                     fplog,
     /* When bSortCG=true, we have already set the size for zone 0 */
     set_zones_size(dd, state_local->box, &ddbox, bSortCG ? 1 : 0, comm->zones.n, 0);
 
-    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDSetupComm);
 
     /*
        write_dd_pdb("dd_home",step,"dump",top_global,cr,
                  -1,state_local->x.rvec_array(),state_local->box);
      */
 
-    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
+    wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeTop);
 
     /* Extract a local topology from the global topology */
     IVec numPulses;
@@ -3201,9 +3201,9 @@ void dd_partition_system(FILE*                     fplog,
                       top_global,
                       top_local);
 
-    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeTop);
 
-    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
+    wallcycle_sub_start(wcycle, WallCycleSubCounter::DDMakeConstr);
 
     /* Set up the special atom communication */
     int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
@@ -3238,9 +3238,9 @@ void dd_partition_system(FILE*                     fplog,
         comm->atomRanges.setEnd(range, n);
     }
 
-    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDMakeConstr);
 
-    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
+    wallcycle_sub_start(wcycle, WallCycleSubCounter::DDTopOther);
 
     /* Make space for the extra coordinates for virtual site
      * or constraint communication.
@@ -3325,11 +3325,11 @@ void dd_partition_system(FILE*                     fplog,
      */
     dd_move_x_vsites(*dd, state_local->box, state_local->x.rvec_array());
 
-    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::DDTopOther);
 
     if (comm->ddSettings.nstDDDump > 0 && step % comm->ddSettings.nstDDDump == 0)
     {
-        dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
+        dd_move_x(dd, state_local->box, state_local->x, nullptr);
         write_dd_pdb("dd_dump",
                      step,
                      "dump",
@@ -3361,7 +3361,7 @@ void dd_partition_system(FILE*                     fplog,
         check_index_consistency(dd, top_global.natoms, "after partitioning");
     }
 
-    wallcycle_stop(wcycle, ewcDOMDEC);
+    wallcycle_stop(wcycle, WallCycleCounter::Domdec);
 }
 
 } // namespace gmx
diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp
index bd96d59070..5f9ffb42d9 100644
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -1202,10 +1202,10 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
         }
         else
         {
-            wallcycle_start(wcycle, ewcPME_REDISTXF);
+            wallcycle_start(wcycle, WallCycleCounter::PmeRedistXF);
             do_redist_pos_coeffs(pme, cr, bFirst, coordinates, coefficient);
 
-            wallcycle_stop(wcycle, ewcPME_REDISTXF);
+            wallcycle_stop(wcycle, WallCycleCounter::PmeRedistXF);
         }
 
         if (debug)
@@ -1213,7 +1213,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
             fprintf(debug, "Rank= %6d, pme local particles=%6d\n", cr->nodeid, atc.numAtoms());
         }
 
-        wallcycle_start(wcycle, ewcPME_SPREAD);
+        wallcycle_start(wcycle, WallCycleCounter::PmeSpread);
 
         /* Spread the coefficients on a grid */
         spread_on_grid(pme, &atc, pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
@@ -1237,7 +1237,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
             copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
         }
 
-        wallcycle_stop(wcycle, ewcPME_SPREAD);
+        wallcycle_stop(wcycle, WallCycleCounter::PmeSpread);
 
         /* TODO If the OpenMP and single-threaded implementations
            converge, then spread_on_grid() and
@@ -1256,18 +1256,20 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                 /* do 3d-fft */
                 if (thread == 0)
                 {
-                    wallcycle_start(wcycle, ewcPME_FFT);
+                    wallcycle_start(wcycle, WallCycleCounter::PmeFft);
                 }
                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX, thread, wcycle);
                 if (thread == 0)
                 {
-                    wallcycle_stop(wcycle, ewcPME_FFT);
+                    wallcycle_stop(wcycle, WallCycleCounter::PmeFft);
                 }
 
                 /* solve in k-space for our local cells */
                 if (thread == 0)
                 {
-                    wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+                    wallcycle_start(
+                            wcycle,
+                            (grid_index < DO_Q ? WallCycleCounter::PmeSolve : WallCycleCounter::LJPme));
                 }
                 if (grid_index < DO_Q)
                 {
@@ -1292,19 +1294,21 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
 
                 if (thread == 0)
                 {
-                    wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+                    wallcycle_stop(
+                            wcycle,
+                            (grid_index < DO_Q ? WallCycleCounter::PmeSolve : WallCycleCounter::LJPme));
                     inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                 }
 
                 /* do 3d-invfft */
                 if (thread == 0)
                 {
-                    wallcycle_start(wcycle, ewcPME_FFT);
+                    wallcycle_start(wcycle, WallCycleCounter::PmeFft);
                 }
                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL, thread, wcycle);
                 if (thread == 0)
                 {
-                    wallcycle_stop(wcycle, ewcPME_FFT);
+                    wallcycle_stop(wcycle, WallCycleCounter::PmeFft);
 
 
                     if (pme->nodeid == 0)
@@ -1317,7 +1321,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                     /* Note: this wallcycle region is closed below
                        outside an OpenMP region, so take care if
                        refactoring code here. */
-                    wallcycle_start(wcycle, ewcPME_GATHER);
+                    wallcycle_start(wcycle, WallCycleCounter::PmeGather);
                 }
 
                 copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
@@ -1366,7 +1370,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
             inc_nrnb(nrnb, eNR_GATHERFBSP, pme->pme_order * pme->pme_order * pme->pme_order * atc.numAtoms());
             /* Note: this wallcycle region is opened above inside an OpenMP
                region, so take care if refactoring code here. */
-            wallcycle_stop(wcycle, ewcPME_GATHER);
+            wallcycle_stop(wcycle, WallCycleCounter::PmeGather);
         }
 
         if (computeEnergyAndVirial)
@@ -1431,7 +1435,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                         break;
                     default: gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
                 }
-                wallcycle_start(wcycle, ewcPME_REDISTXF);
+                wallcycle_start(wcycle, WallCycleCounter::PmeRedistXF);
 
                 do_redist_pos_coeffs(pme, cr, bFirst, coordinates, RedistC6);
                 pme->lb_buf1.resize(atc.numAtoms());
@@ -1449,7 +1453,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                     local_sigma[i] = atc.coefficient[i];
                 }
 
-                wallcycle_stop(wcycle, ewcPME_REDISTXF);
+                wallcycle_stop(wcycle, WallCycleCounter::PmeRedistXF);
             }
             atc.coefficient = coefficientBuffer;
             calc_initial_lb_coeffs(coefficientBuffer, local_c6, local_sigma);
@@ -1464,7 +1468,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                 calc_next_lb_coeffs(coefficientBuffer, local_sigma);
                 grid = pmegrid->grid.grid;
 
-                wallcycle_start(wcycle, ewcPME_SPREAD);
+                wallcycle_start(wcycle, WallCycleCounter::PmeSpread);
                 /* Spread the c6 on a grid */
                 spread_on_grid(pme, &atc, pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
 
@@ -1486,7 +1490,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                     }
                     copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
                 }
-                wallcycle_stop(wcycle, ewcPME_SPREAD);
+                wallcycle_stop(wcycle, WallCycleCounter::PmeSpread);
 
                 /*Here we start a large thread parallel region*/
 #pragma omp parallel num_threads(pme->nthread) private(thread)
@@ -1497,13 +1501,13 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                         /* do 3d-fft */
                         if (thread == 0)
                         {
-                            wallcycle_start(wcycle, ewcPME_FFT);
+                            wallcycle_start(wcycle, WallCycleCounter::PmeFft);
                         }
 
                         gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX, thread, wcycle);
                         if (thread == 0)
                         {
-                            wallcycle_stop(wcycle, ewcPME_FFT);
+                            wallcycle_stop(wcycle, WallCycleCounter::PmeFft);
                         }
                     }
                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
@@ -1519,7 +1523,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                     thread = gmx_omp_get_thread_num();
                     if (thread == 0)
                     {
-                        wallcycle_start(wcycle, ewcLJPME);
+                        wallcycle_start(wcycle, WallCycleCounter::LJPme);
                     }
 
                     loop_count =
@@ -1532,7 +1536,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                                              thread);
                     if (thread == 0)
                     {
-                        wallcycle_stop(wcycle, ewcLJPME);
+                        wallcycle_stop(wcycle, WallCycleCounter::LJPme);
                         inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                     }
                 }
@@ -1565,13 +1569,13 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                         /* do 3d-invfft */
                         if (thread == 0)
                         {
-                            wallcycle_start(wcycle, ewcPME_FFT);
+                            wallcycle_start(wcycle, WallCycleCounter::PmeFft);
                         }
 
                         gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL, thread, wcycle);
                         if (thread == 0)
                         {
-                            wallcycle_stop(wcycle, ewcPME_FFT);
+                            wallcycle_stop(wcycle, WallCycleCounter::PmeFft);
 
 
                             if (pme->nodeid == 0)
@@ -1580,7 +1584,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                                 npme      = static_cast<int>(ntot * std::log(ntot) / std::log(2.0));
                                 inc_nrnb(nrnb, eNR_FFT, 2 * npme);
                             }
-                            wallcycle_start(wcycle, ewcPME_GATHER);
+                            wallcycle_start(wcycle, WallCycleCounter::PmeGather);
                         }
 
                         copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
@@ -1619,7 +1623,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
                              eNR_GATHERFBSP,
                              pme->pme_order * pme->pme_order * pme->pme_order * pme->atc[0].numAtoms());
                 }
-                wallcycle_stop(wcycle, ewcPME_GATHER);
+                wallcycle_stop(wcycle, WallCycleCounter::PmeGather);
 
                 bFirst = FALSE;
             } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
@@ -1628,7 +1632,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
 
     if (stepWork.computeForces && pme->nnodes > 1)
     {
-        wallcycle_start(wcycle, ewcPME_REDISTXF);
+        wallcycle_start(wcycle, WallCycleCounter::PmeRedistXF);
         for (d = 0; d < pme->ndecompdim; d++)
         {
             gmx::ArrayRef<gmx::RVec> forcesRef;
@@ -1648,7 +1652,7 @@ int gmx_pme_do(struct gmx_pme_t*              pme,
             }
         }
 
-        wallcycle_stop(wcycle, ewcPME_REDISTXF);
+        wallcycle_stop(wcycle, WallCycleCounter::PmeRedistXF);
     }
 
     if (computeEnergyAndVirial)
diff --git a/src/gromacs/ewald/pme_gpu.cpp b/src/gromacs/ewald/pme_gpu.cpp
index 225fb1050a..564e213af9 100644
--- a/src/gromacs/ewald/pme_gpu.cpp
+++ b/src/gromacs/ewald/pme_gpu.cpp
@@ -123,25 +123,25 @@ int pme_gpu_get_block_size(const gmx_pme_t* pme)
 void inline parallel_3dfft_execute_gpu_wrapper(gmx_pme_t*             pme,
                                                const int              gridIndex,
                                                enum gmx_fft_direction dir,
-                                               gmx_wallcycle_t        wcycle)
+                                               gmx_wallcycle*         wcycle)
 {
     if (pme_gpu_settings(pme->gpu).performGPUFFT)
     {
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+        wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+        wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
         pme_gpu_3dfft(pme->gpu, dir, gridIndex);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+        wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
     }
     else
     {
-        wallcycle_start(wcycle, ewcPME_FFT_MIXED_MODE);
+        wallcycle_start(wcycle, WallCycleCounter::PmeFftMixedMode);
 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
         for (int thread = 0; thread < pme->nthread; thread++)
         {
             gmx_parallel_3dfft_execute(pme->pfft_setup[gridIndex], dir, thread, wcycle);
         }
-        wallcycle_stop(wcycle, ewcPME_FFT_MIXED_MODE);
+        wallcycle_stop(wcycle, WallCycleCounter::PmeFftMixedMode);
     }
 }
 
@@ -172,11 +172,11 @@ void pme_gpu_prepare_computation(gmx_pme_t*               pme,
 
     if (stepWork.haveDynamicBox || shouldUpdateBox) // || is to make the first computation always update
     {
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+        wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+        wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
         pme_gpu_update_input_box(pmeGpu, box);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+        wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 
         if (!pme_gpu_settings(pmeGpu).performGPUSolve)
         {
@@ -213,11 +213,11 @@ void pme_gpu_launch_spread(gmx_pme_t*            pme,
     /* Spread the coefficients on a grid */
     const bool computeSplines = true;
     const bool spreadCharges  = true;
-    wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-    wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+    wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
     pme_gpu_spread(pmeGpu, xReadyOnDevice, fftgrids, computeSplines, spreadCharges, lambdaQ);
-    wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
-    wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+    wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 }
 
 void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, const gmx::StepWorkload& stepWork)
@@ -228,9 +228,9 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, co
     const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
     if (!settings.performGPUFFT)
     {
-        wallcycle_start(wcycle, ewcWAIT_GPU_PME_SPREAD);
+        wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeSpread);
         pme_gpu_sync_spread_grid(pme->gpu);
-        wallcycle_stop(wcycle, ewcWAIT_GPU_PME_SPREAD);
+        wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeSpread);
     }
 
     try
@@ -248,21 +248,21 @@ void pme_gpu_launch_complex_transforms(gmx_pme_t* pme, gmx_wallcycle* wcycle, co
             {
                 const auto gridOrdering =
                         settings.useDecomposition ? GridOrdering::YZX : GridOrdering::XYZ;
-                wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-                wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+                wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+                wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
                 pme_gpu_solve(pmeGpu, gridIndex, cfftgrid, gridOrdering, computeEnergyAndVirial);
-                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
-                wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+                wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+                wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
             }
             else
             {
-                wallcycle_start(wcycle, ewcPME_SOLVE_MIXED_MODE);
+                wallcycle_start(wcycle, WallCycleCounter::PmeSolveMixedMode);
 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
                 for (int thread = 0; thread < pme->nthread; thread++)
                 {
                     solve_pme_yzx(pme, cfftgrid, pme->boxVolume, computeEnergyAndVirial, pme->nthread, thread);
                 }
-                wallcycle_stop(wcycle, ewcPME_SOLVE_MIXED_MODE);
+                wallcycle_stop(wcycle, WallCycleCounter::PmeSolveMixedMode);
             }
 
             parallel_3dfft_execute_gpu_wrapper(pme, gridIndex, GMX_FFT_COMPLEX_TO_REAL, wcycle);
@@ -280,13 +280,13 @@ void pme_gpu_launch_gather(const gmx_pme_t* pme, gmx_wallcycle gmx_unused* wcycl
         return;
     }
 
-    wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-    wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+    wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
 
     float** fftgrids = pme->fftgrid;
     pme_gpu_gather(pme->gpu, fftgrids, lambdaQ);
-    wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
-    wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+    wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 }
 
 //! Accumulate the \c forcesToAdd to \c f, using the available threads.
@@ -309,7 +309,7 @@ static void pme_gpu_reduce_outputs(const bool            computeEnergyAndVirial,
                                    gmx::ForceWithVirial* forceWithVirial,
                                    gmx_enerdata_t*       enerd)
 {
-    wallcycle_start(wcycle, ewcPME_GPU_F_REDUCTION);
+    wallcycle_start(wcycle, WallCycleCounter::PmeGpuFReduction);
     GMX_ASSERT(forceWithVirial, "Invalid force pointer");
 
     if (computeEnergyAndVirial)
@@ -323,7 +323,7 @@ static void pme_gpu_reduce_outputs(const bool            computeEnergyAndVirial,
     {
         sum_forces(forceWithVirial->force_, output.forces_);
     }
-    wallcycle_stop(wcycle, ewcPME_GPU_F_REDUCTION);
+    wallcycle_stop(wcycle, WallCycleCounter::PmeGpuFReduction);
 }
 
 bool pme_gpu_try_finish_task(gmx_pme_t*               pme,
@@ -348,11 +348,11 @@ bool pme_gpu_try_finish_task(gmx_pme_t*               pme,
     // TODO: implement c_streamQuerySupported with an additional GpuEventSynchronizer per stream (#2521)
     if ((completionKind == GpuTaskCompletion::Check) && c_streamQuerySupported)
     {
-        wallcycle_start_nocount(wcycle, ewcWAIT_GPU_PME_GATHER);
+        wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuPmeGather);
         // Query the PME stream for completion of all tasks enqueued and
         // if we're not done, stop the timer before early return.
         const bool pmeGpuDone = pme_gpu_stream_query(pme->gpu);
-        wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+        wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather);
 
         if (!pmeGpuDone)
         {
@@ -361,7 +361,7 @@ bool pme_gpu_try_finish_task(gmx_pme_t*               pme,
         needToSynchronize = false;
     }
 
-    wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
+    wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeGather);
     // If the above check passed, then there is no need to make an
     // explicit synchronization call.
     if (needToSynchronize)
@@ -374,7 +374,7 @@ bool pme_gpu_try_finish_task(gmx_pme_t*               pme,
     const bool computeEnergyAndVirial = stepWork.computeEnergy || stepWork.computeVirial;
     PmeOutput  output                 = pme_gpu_getOutput(
             *pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0);
-    wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+    wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather);
 
     GMX_ASSERT(pme->gpu->settings.useGpuForceReduction == !output.haveForceOutput_,
                "When forces are reduced on the CPU, there needs to be force output");
@@ -391,7 +391,7 @@ PmeOutput pme_gpu_wait_finish_task(gmx_pme_t*     pme,
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 
-    wallcycle_start(wcycle, ewcWAIT_GPU_PME_GATHER);
+    wallcycle_start(wcycle, WallCycleCounter::WaitGpuPmeGather);
 
     // Synchronize the whole PME stream at once, including D2H result transfers
     // if there are outputs we need to wait for at this step; we still call getOutputs
@@ -403,7 +403,7 @@ PmeOutput pme_gpu_wait_finish_task(gmx_pme_t*     pme,
 
     PmeOutput output = pme_gpu_getOutput(
             *pme, computeEnergyAndVirial, pme->gpu->common->ngrids > 1 ? lambdaQ : 1.0);
-    wallcycle_stop(wcycle, ewcWAIT_GPU_PME_GATHER);
+    wallcycle_stop(wcycle, WallCycleCounter::WaitGpuPmeGather);
     return output;
 }
 
@@ -428,16 +428,16 @@ void pme_gpu_reinit_computation(const gmx_pme_t* pme, gmx_wallcycle* wcycle)
 {
     GMX_ASSERT(pme_gpu_active(pme), "This should be a GPU run of PME but it is not enabled.");
 
-    wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-    wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_PME);
+    wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuPme);
 
     pme_gpu_update_timings(pme->gpu);
 
     pme_gpu_clear_grids(pme->gpu);
     pme_gpu_clear_energy_virial(pme->gpu);
 
-    wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_PME);
-    wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuPme);
+    wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 }
 
 DeviceBuffer<gmx::RVec> pme_gpu_get_device_f(const gmx_pme_t* pme)
diff --git a/src/gromacs/ewald/pme_load_balancing.cpp b/src/gromacs/ewald/pme_load_balancing.cpp
index 19778b8999..2e900d3bd4 100644
--- a/src/gromacs/ewald/pme_load_balancing.cpp
+++ b/src/gromacs/ewald/pme_load_balancing.cpp
@@ -928,7 +928,7 @@ void pme_loadbal_do(pme_load_balancing_t*          pme_lb,
                     t_forcerec*                    fr,
                     const matrix                   box,
                     gmx::ArrayRef<const gmx::RVec> x,
-                    gmx_wallcycle_t                wcycle,
+                    gmx_wallcycle*                 wcycle,
                     int64_t                        step,
                     int64_t                        step_rel,
                     gmx_bool*                      bPrinting,
@@ -946,7 +946,7 @@ void pme_loadbal_do(pme_load_balancing_t*          pme_lb,
 
     n_prev      = pme_lb->cycles_n;
     cycles_prev = pme_lb->cycles_c;
-    wallcycle_get(wcycle, ewcSTEP, &pme_lb->cycles_n, &pme_lb->cycles_c);
+    wallcycle_get(wcycle, WallCycleCounter::Step, &pme_lb->cycles_n, &pme_lb->cycles_c);
 
     /* Before the first step we haven't done any steps yet.
      * Also handle cases where ir.init_step % ir.nstlist != 0.
diff --git a/src/gromacs/ewald/pme_load_balancing.h b/src/gromacs/ewald/pme_load_balancing.h
index bb98635ca2..38ba5c74e6 100644
--- a/src/gromacs/ewald/pme_load_balancing.h
+++ b/src/gromacs/ewald/pme_load_balancing.h
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -92,7 +92,7 @@ void pme_loadbal_init(pme_load_balancing_t**     pme_lb_p,
  *
  * Process the cycles measured over the last nstlist steps and then
  * either continue balancing or check if we need to trigger balancing.
- * Should be called after the ewcSTEP cycle counter has been stopped.
+ * Should be called after the WallCycleCounter::Step cycle counter has been stopped.
  * Returns if the load balancing is printing to fp_err.
  */
 void pme_loadbal_do(pme_load_balancing_t*          pme_lb,
@@ -104,7 +104,7 @@ void pme_loadbal_do(pme_load_balancing_t*          pme_lb,
                     t_forcerec*                    fr,
                     const matrix                   box,
                     gmx::ArrayRef<const gmx::RVec> x,
-                    gmx_wallcycle_t                wcycle,
+                    gmx_wallcycle*                 wcycle,
                     int64_t                        step,
                     int64_t                        step_rel,
                     gmx_bool*                      bPrinting,
diff --git a/src/gromacs/ewald/pme_only.cpp b/src/gromacs/ewald/pme_only.cpp
index 4542a05f42..5130034a9f 100644
--- a/src/gromacs/ewald/pme_only.cpp
+++ b/src/gromacs/ewald/pme_only.cpp
@@ -167,17 +167,17 @@ static std::unique_ptr<gmx_pme_pp> gmx_pme_pp_init(const t_commrec* cr)
     return pme_pp;
 }
 
-static void reset_pmeonly_counters(gmx_wallcycle_t           wcycle,
+static void reset_pmeonly_counters(gmx_wallcycle*            wcycle,
                                    gmx_walltime_accounting_t walltime_accounting,
                                    t_nrnb*                   nrnb,
                                    int64_t                   step,
                                    bool                      useGpuForPme)
 {
     /* Reset all the counters related to performance over the run */
-    wallcycle_stop(wcycle, ewcRUN);
+    wallcycle_stop(wcycle, WallCycleCounter::Run);
     wallcycle_reset_all(wcycle);
     *nrnb = { 0 };
-    wallcycle_start(wcycle, ewcRUN);
+    wallcycle_start(wcycle, WallCycleCounter::Run);
     walltime_accounting_reset_time(walltime_accounting, step);
 
     if (useGpuForPme)
@@ -708,11 +708,11 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
 
         if (count == 0)
         {
-            wallcycle_start(wcycle, ewcRUN);
+            wallcycle_start(wcycle, WallCycleCounter::Run);
             walltime_accounting_start_time(walltime_accounting);
         }
 
-        wallcycle_start(wcycle, ewcPMEMESH);
+        wallcycle_start(wcycle, WallCycleCounter::PmeMesh);
 
         dvdlambda_q  = 0;
         dvdlambda_lj = 0;
@@ -779,7 +779,7 @@ int gmx_pmeonly(struct gmx_pme_t*               pme,
             output.forces_ = pme_pp->f;
         }
 
-        cycles = wallcycle_stop(wcycle, ewcPMEMESH);
+        cycles = wallcycle_stop(wcycle, WallCycleCounter::PmeMesh);
         gmx_pme_send_force_vir_ener(pme_pp.get(), output, dvdlambda_q, dvdlambda_lj, cycles);
 
         count++;
diff --git a/src/gromacs/ewald/pme_pp.cpp b/src/gromacs/ewald/pme_pp.cpp
index 3a929daf1f..63693ed6a6 100644
--- a/src/gromacs/ewald/pme_pp.cpp
+++ b/src/gromacs/ewald/pme_pp.cpp
@@ -371,7 +371,7 @@ void gmx_pme_send_coordinates(t_forcerec*           fr,
                               GpuEventSynchronizer* coordinatesReadyOnDeviceEvent,
                               gmx_wallcycle*        wcycle)
 {
-    wallcycle_start(wcycle, ewcPP_PMESENDX);
+    wallcycle_start(wcycle, WallCycleCounter::PpPmeSendX);
 
     unsigned int flags = PP_PME_COORD;
     if (computeEnergyAndVirial)
@@ -399,7 +399,7 @@ void gmx_pme_send_coordinates(t_forcerec*           fr,
                                sendCoordinatesFromGpu,
                                coordinatesReadyOnDeviceEvent);
 
-    wallcycle_stop(wcycle, ewcPP_PMESENDX);
+    wallcycle_stop(wcycle, WallCycleCounter::PpPmeSendX);
 }
 
 void gmx_pme_send_finish(const t_commrec* cr)
diff --git a/src/gromacs/fft/fft5d.cpp b/src/gromacs/fft/fft5d.cpp
index 71020eec6b..be74cbaf6d 100644
--- a/src/gromacs/fft/fft5d.cpp
+++ b/src/gromacs/fft/fft5d.cpp
@@ -1286,7 +1286,7 @@ void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
                     time = MPI_Wtime();
                 }
 #else
-                wallcycle_start(times, ewcPME_FFTCOMM);
+                wallcycle_start(times, WallCycleCounter::PmeFftComm);
 #endif
 #ifdef FFT5D_MPI_TRANSPOSE
                 FFTW(execute)(mpip[s]);
@@ -1323,7 +1323,7 @@ void fft5d_execute(fft5d_plan plan, int thread, fft5d_time times)
                     time_mpi[s] = MPI_Wtime() - time;
                 }
 #else
-                wallcycle_stop(times, ewcPME_FFTCOMM);
+                wallcycle_stop(times, WallCycleCounter::PmeFftComm);
 #endif
             }       /*master*/
         }           /* bPrallelDim */
diff --git a/src/gromacs/fft/fft5d.h b/src/gromacs/fft/fft5d.h
index 71d86817a0..b4e5d008e4 100644
--- a/src/gromacs/fft/fft5d.h
+++ b/src/gromacs/fft/fft5d.h
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2009-2017, The GROMACS development team.
- * Copyright (c) 2019, by the GROMACS development team, led by
+ * Copyright (c) 2019,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -73,7 +73,7 @@ struct fft5d_time_t
 typedef struct fft5d_time_t* fft5d_time;
 #else
 #    include "gromacs/timing/wallcycle.h"
-typedef gmx_wallcycle_t fft5d_time;
+typedef gmx_wallcycle* fft5d_time;
 #endif
 
 namespace gmx
diff --git a/src/gromacs/fft/parallel_3dfft.cpp b/src/gromacs/fft/parallel_3dfft.cpp
index 24a679fc30..33d916ec20 100644
--- a/src/gromacs/fft/parallel_3dfft.cpp
+++ b/src/gromacs/fft/parallel_3dfft.cpp
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 1991-2005 David van der Spoel, Erik Lindahl, University of Groningen.
- * Copyright (c) 2013,2014,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -170,7 +170,7 @@ int gmx_parallel_3dfft_complex_limits(gmx_parallel_3dfft_t pfft_setup,
 int gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t   pfft_setup,
                                enum gmx_fft_direction dir,
                                int                    thread,
-                               gmx_wallcycle_t        wcycle)
+                               gmx_wallcycle*         wcycle)
 {
     if (((pfft_setup->p1->flags & FFT5D_REALCOMPLEX) == 0)
         ^ (dir == GMX_FFT_FORWARD || dir == GMX_FFT_BACKWARD))
diff --git a/src/gromacs/fft/parallel_3dfft.h b/src/gromacs/fft/parallel_3dfft.h
index 91cd66103f..5652f4c422 100644
--- a/src/gromacs/fft/parallel_3dfft.h
+++ b/src/gromacs/fft/parallel_3dfft.h
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 1991-2005 David van der Spoel, Erik Lindahl, University of Groningen.
- * Copyright (c) 2013,2014,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2013,2014,2017,2018,2019,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -104,7 +104,7 @@ int gmx_parallel_3dfft_complex_limits(gmx_parallel_3dfft_t pfft_setup,
 int gmx_parallel_3dfft_execute(gmx_parallel_3dfft_t   pfft_setup,
                                enum gmx_fft_direction dir,
                                int                    thread,
-                               gmx_wallcycle_t        wcycle);
+                               gmx_wallcycle*         wcycle);
 
 
 /*! \brief Release all data in parallel fft setup
diff --git a/src/gromacs/imd/imd.cpp b/src/gromacs/imd/imd.cpp
index cf475be7cc..33285ca106 100644
--- a/src/gromacs/imd/imd.cpp
+++ b/src/gromacs/imd/imd.cpp
@@ -1525,7 +1525,7 @@ bool ImdSession::Impl::run(int64_t step, bool bNS, const matrix box, gmx::ArrayR
         return false;
     }
 
-    wallcycle_start(wcycle, ewcIMD);
+    wallcycle_start(wcycle, WallCycleCounter::Imd);
 
     /* read command from client and check if new incoming connection */
     if (MASTER(cr))
@@ -1576,7 +1576,7 @@ bool ImdSession::Impl::run(int64_t step, bool bNS, const matrix box, gmx::ArrayR
         }
     }
 
-    wallcycle_stop(wcycle, ewcIMD);
+    wallcycle_stop(wcycle, WallCycleCounter::Imd);
 
     return imdstep;
 }
@@ -1641,7 +1641,7 @@ void ImdSession::updateEnergyRecordAndSendPositionsAndEnergies(bool bIMDstep, in
         return;
     }
 
-    wallcycle_start(impl_->wcycle, ewcIMD);
+    wallcycle_start(impl_->wcycle, WallCycleCounter::Imd);
 
     /* Update time step for IMD and prepare IMD energy record if we have new energies. */
     fillEnergyRecord(step, bHaveNewEnergies);
@@ -1652,7 +1652,7 @@ void ImdSession::updateEnergyRecordAndSendPositionsAndEnergies(bool bIMDstep, in
         sendPositionsAndEnergies();
     }
 
-    wallcycle_stop(impl_->wcycle, ewcIMD);
+    wallcycle_stop(impl_->wcycle, WallCycleCounter::Imd);
 }
 
 void ImdSession::applyForces(gmx::ArrayRef<gmx::RVec> force)
@@ -1662,7 +1662,7 @@ void ImdSession::applyForces(gmx::ArrayRef<gmx::RVec> force)
         return;
     }
 
-    wallcycle_start(impl_->wcycle, ewcIMD);
+    wallcycle_start(impl_->wcycle, WallCycleCounter::Imd);
 
     for (int i = 0; i < impl_->nforces; i++)
     {
@@ -1680,7 +1680,7 @@ void ImdSession::applyForces(gmx::ArrayRef<gmx::RVec> force)
         rvec_inc(force[j], impl_->f[i]);
     }
 
-    wallcycle_stop(impl_->wcycle, ewcIMD);
+    wallcycle_stop(impl_->wcycle, WallCycleCounter::Imd);
 }
 
 ImdSession::ImdSession(const MDLogger& mdlog) : impl_(new Impl(mdlog)) {}
diff --git a/src/gromacs/listed_forces/gpubonded_impl.cu b/src/gromacs/listed_forces/gpubonded_impl.cu
index 4a8bbe41ce..c5fbd00e46 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cu
+++ b/src/gromacs/listed_forces/gpubonded_impl.cu
@@ -311,11 +311,11 @@ void GpuBonded::Impl::launchEnergyTransfer()
     GMX_ASSERT(haveInteractions_,
                "No GPU bonded interactions, so no energies will be computed, so transfer should "
                "not be called");
-    wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
+    wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
     // TODO add conditional on whether there has been any compute (and make sure host buffer doesn't contain garbage)
     float* h_vTot = vTot_.data();
     copyFromDeviceBuffer(h_vTot, &d_vTot_, 0, F_NRE, deviceStream_, GpuApiCallBehavior::Async, nullptr);
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
 }
 
 void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd)
@@ -324,10 +324,10 @@ void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd)
                "No GPU bonded interactions, so no energies will be computed or transferred, so "
                "accumulation should not occur");
 
-    wallcycle_start(wcycle_, ewcWAIT_GPU_BONDED);
+    wallcycle_start(wcycle_, WallCycleCounter::WaitGpuBonded);
     cudaError_t stat = cudaStreamSynchronize(deviceStream_.stream());
     CU_RET_ERR(stat, "D2H transfer of bonded energies failed");
-    wallcycle_stop(wcycle_, ewcWAIT_GPU_BONDED);
+    wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuBonded);
 
     for (int fType : fTypesOnGpu)
     {
@@ -346,11 +346,11 @@ void GpuBonded::Impl::waitAccumulateEnergyTerms(gmx_enerdata_t* enerd)
 
 void GpuBonded::Impl::clearEnergies()
 {
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_BONDED);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
     clearDeviceBufferAsync(&d_vTot_, 0, F_NRE, deviceStream_);
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 // ---- GpuBonded
diff --git a/src/gromacs/listed_forces/gpubondedkernels.cu b/src/gromacs/listed_forces/gpubondedkernels.cu
index 57fbbc1be4..1537c58d58 100644
--- a/src/gromacs/listed_forces/gpubondedkernels.cu
+++ b/src/gromacs/listed_forces/gpubondedkernels.cu
@@ -914,8 +914,8 @@ void GpuBonded::Impl::launchKernel()
     GMX_ASSERT(haveInteractions_,
                "Cannot launch bonded GPU kernels unless bonded GPU work was scheduled");
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_BONDED);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
 
     int fTypeRangeEnd = kernelParams_.fTypeRangeEnd[numFTypesOnGpu - 1];
 
@@ -935,8 +935,8 @@ void GpuBonded::Impl::launchKernel()
                     "exec_kernel_gpu<calcVir, calcEner>",
                     kernelArgs);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_BONDED);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuBonded);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 void GpuBonded::launchKernel(const gmx::StepWorkload& stepWork)
diff --git a/src/gromacs/listed_forces/listed_forces.cpp b/src/gromacs/listed_forces/listed_forces.cpp
index b0d83be843..a35e58911c 100644
--- a/src/gromacs/listed_forces/listed_forces.cpp
+++ b/src/gromacs/listed_forces/listed_forces.cpp
@@ -657,7 +657,7 @@ void calc_listed(struct gmx_wallcycle*         wcycle,
     {
         gmx::ForceWithShiftForces& forceWithShiftForces = forceOutputs->forceWithShiftForces();
 
-        wallcycle_sub_start(wcycle, ewcsLISTED);
+        wallcycle_sub_start(wcycle, WallCycleSubCounter::Listed);
         /* The dummy array is to have a place to store the dhdl at other values
            of lambda, which will be thrown away in the end */
         gmx::EnumerationArray<FreeEnergyPerturbationCouplingType, real> dvdl = { 0 };
@@ -675,9 +675,9 @@ void calc_listed(struct gmx_wallcycle*         wcycle,
                          fcd,
                          stepWork,
                          global_atom_index);
-        wallcycle_sub_stop(wcycle, ewcsLISTED);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::Listed);
 
-        wallcycle_sub_start(wcycle, ewcsLISTED_BUF_OPS);
+        wallcycle_sub_start(wcycle, WallCycleSubCounter::ListedBufOps);
         reduce_thread_output(&forceWithShiftForces, enerd->term.data(), &enerd->grpp, dvdl, bt, stepWork);
 
         if (stepWork.computeDhdl)
@@ -687,7 +687,7 @@ void calc_listed(struct gmx_wallcycle*         wcycle,
                 enerd->dvdl_nonlin[i] += dvdl[i];
             }
         }
-        wallcycle_sub_stop(wcycle, ewcsLISTED_BUF_OPS);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::ListedBufOps);
     }
 
     /* Copy the sum of violations for the distance restraints from fcd */
@@ -829,7 +829,7 @@ void ListedForces::calculate(struct gmx_wallcycle*                     wcycle,
            awkward to account to this subtimer properly in the present
            code. We don't test / care much about performance with
            restraints, anyway. */
-        wallcycle_sub_start(wcycle, ewcsRESTRAINTS);
+        wallcycle_sub_start(wcycle, WallCycleSubCounter::Restraints);
 
         if (!idef.il[F_POSRES].empty())
         {
@@ -868,7 +868,7 @@ void ListedForces::calculate(struct gmx_wallcycle*                     wcycle,
                             hist);
         }
 
-        wallcycle_sub_stop(wcycle, ewcsRESTRAINTS);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::Restraints);
     }
 
     calc_listed(wcycle, idef, threading_.get(), x, forceOutputs, fr, pbc, enerd, nrnb, lambda, md, fcdata, global_atom_index, stepWork);
@@ -885,7 +885,7 @@ void ListedForces::calculate(struct gmx_wallcycle*                     wcycle,
         }
         if (idef.ilsort != ilsortNO_FE)
         {
-            wallcycle_sub_start(wcycle, ewcsLISTED_FEP);
+            wallcycle_sub_start(wcycle, WallCycleSubCounter::ListedFep);
             if (idef.ilsort != ilsortFE_SORTED)
             {
                 gmx_incons("The bonded interactions are not sorted for free energy");
@@ -919,7 +919,7 @@ void ListedForces::calculate(struct gmx_wallcycle*                     wcycle,
                 std::fill(std::begin(dvdl), std::end(dvdl), 0.0);
                 enerd->foreignLambdaTerms.accumulate(i, enerd->foreign_term[F_EPOT], dvdlSum);
             }
-            wallcycle_sub_stop(wcycle, ewcsLISTED_FEP);
+            wallcycle_sub_stop(wcycle, WallCycleSubCounter::ListedFep);
         }
     }
 }
diff --git a/src/gromacs/listed_forces/position_restraints.cpp b/src/gromacs/listed_forces/position_restraints.cpp
index aa881af20d..165b893208 100644
--- a/src/gromacs/listed_forces/position_restraints.cpp
+++ b/src/gromacs/listed_forces/position_restraints.cpp
@@ -463,7 +463,7 @@ void posres_wrapper_lambda(struct gmx_wallcycle*         wcycle,
                            gmx::ArrayRef<const real>     lambda,
                            const t_forcerec*             fr)
 {
-    wallcycle_sub_start_nocount(wcycle, ewcsRESTRAINTS);
+    wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::Restraints);
 
     auto& foreignTerms = enerd->foreignLambdaTerms;
     for (int i = 0; i < 1 + foreignTerms.numLambdas(); i++)
@@ -487,7 +487,7 @@ void posres_wrapper_lambda(struct gmx_wallcycle*         wcycle,
                                      fr->posres_comB);
         foreignTerms.accumulate(i, v, dvdl);
     }
-    wallcycle_sub_stop(wcycle, ewcsRESTRAINTS);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::Restraints);
 }
 
 /*! \brief Helper function that wraps calls to fbposres for
diff --git a/src/gromacs/mdlib/constr.cpp b/src/gromacs/mdlib/constr.cpp
index 398841179c..dca3379ce8 100644
--- a/src/gromacs/mdlib/constr.cpp
+++ b/src/gromacs/mdlib/constr.cpp
@@ -414,7 +414,7 @@ bool Constraints::Impl::apply(bool                      bLog,
     char  buf[22];
     int   nth;
 
-    wallcycle_start(wcycle, ewcCONSTR);
+    wallcycle_start(wcycle, WallCycleCounter::Constr);
 
     if (econq == ConstraintVariable::ForceDispl && !EI_ENERGY_MINIMIZATION(ir.eI))
     {
@@ -781,7 +781,7 @@ bool Constraints::Impl::apply(bool                      bLog,
             do_edsam(&ir, step, cr, xprime.unpaddedArrayRef(), v.unpaddedArrayRef(), box, ed);
         }
     }
-    wallcycle_stop(wcycle, ewcCONSTR);
+    wallcycle_stop(wcycle, WallCycleCounter::Constr);
 
     const bool haveVelocities = (!v.empty() || econq == ConstraintVariable::Velocities);
     if (haveVelocities && !cFREEZE_.empty())
diff --git a/src/gromacs/mdlib/force.cpp b/src/gromacs/mdlib/force.cpp
index 081d603dc1..ad08c862fb 100644
--- a/src/gromacs/mdlib/force.cpp
+++ b/src/gromacs/mdlib/force.cpp
@@ -104,7 +104,7 @@ void calculateLongRangeNonbondeds(t_forcerec*                    fr,
                                   const t_inputrec&              ir,
                                   const t_commrec*               cr,
                                   t_nrnb*                        nrnb,
-                                  gmx_wallcycle_t                wcycle,
+                                  gmx_wallcycle*                 wcycle,
                                   const t_mdatoms*               md,
                                   gmx::ArrayRef<const RVec>      coordinates,
                                   gmx::ForceWithVirial*          forceWithVirial,
@@ -141,7 +141,7 @@ void calculateLongRangeNonbondeds(t_forcerec*                    fr,
             /* Calculate the Ewald surface force and energy contributions, when necessary */
             if (haveEwaldSurfaceTerm)
             {
-                wallcycle_sub_start(wcycle, ewcsEWALD_CORRECTION);
+                wallcycle_sub_start(wcycle, WallCycleSubCounter::EwaldCorrection);
 
                 int nthreads = fr->nthread_ewc;
 #pragma omp parallel for num_threads(nthreads) schedule(static)
@@ -184,7 +184,7 @@ void calculateLongRangeNonbondeds(t_forcerec*                    fr,
                 {
                     reduceEwaldThreadOuput(nthreads, fr->ewc_t);
                 }
-                wallcycle_sub_stop(wcycle, ewcsEWALD_CORRECTION);
+                wallcycle_sub_stop(wcycle, WallCycleSubCounter::EwaldCorrection);
             }
 
             if (EEL_PME_EWALD(fr->ic->eeltype) && fr->n_tpi == 0)
@@ -212,7 +212,7 @@ void calculateLongRangeNonbondeds(t_forcerec*                    fr,
                      */
                     ddBalanceRegionHandler.closeAfterForceComputationCpu();
 
-                    wallcycle_start(wcycle, ewcPMEMESH);
+                    wallcycle_start(wcycle, WallCycleCounter::PmeMesh);
                     status = gmx_pme_do(
                             fr->pmedata,
                             gmx::constArrayRefFromArray(coordinates.data(), md->homenr - fr->n_tpi),
@@ -238,7 +238,7 @@ void calculateLongRangeNonbondeds(t_forcerec*                    fr,
                             &ewaldOutput.dvdl[FreeEnergyPerturbationCouplingType::Coul],
                             &ewaldOutput.dvdl[FreeEnergyPerturbationCouplingType::Vdw],
                             stepWork);
-                    wallcycle_stop(wcycle, ewcPMEMESH);
+                    wallcycle_stop(wcycle, WallCycleCounter::PmeMesh);
                     if (status != 0)
                     {
                         gmx_fatal(FARGS, "Error %d in reciprocal PME routine", status);
diff --git a/src/gromacs/mdlib/gpuforcereduction_impl.cu b/src/gromacs/mdlib/gpuforcereduction_impl.cu
index 80aad0ed38..471cc1d5c7 100644
--- a/src/gromacs/mdlib/gpuforcereduction_impl.cu
+++ b/src/gromacs/mdlib/gpuforcereduction_impl.cu
@@ -130,7 +130,7 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer<Float3>  baseForcePtr,
     completionMarker_ = completionMarker;
     cellInfo_.cell    = cell.data();
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
     reallocateDeviceBuffer(
             &cellInfo_.d_cell, numAtoms_, &cellInfo_.cellSize, &cellInfo_.cellSizeAlloc, deviceContext_);
     copyToDeviceBuffer(&cellInfo_.d_cell,
@@ -140,7 +140,7 @@ void GpuForceReduction::Impl::reinit(DeviceBuffer<Float3>  baseForcePtr,
                        deviceStream_,
                        GpuApiCallBehavior::Async,
                        nullptr);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 
     dependencyList_.clear();
 };
@@ -164,8 +164,8 @@ void GpuForceReduction::Impl::addDependency(GpuEventSynchronizer* const dependen
 
 void GpuForceReduction::Impl::execute()
 {
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
 
     if (numAtoms_ == 0)
     {
@@ -209,8 +209,8 @@ void GpuForceReduction::Impl::execute()
         completionMarker_->markEvent(deviceStream_);
     }
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_F_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBFBufOps);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 GpuForceReduction::Impl::~Impl(){};
diff --git a/src/gromacs/mdlib/md_support.cpp b/src/gromacs/mdlib/md_support.cpp
index 24ae64beec..b991293a56 100644
--- a/src/gromacs/mdlib/md_support.cpp
+++ b/src/gromacs/mdlib/md_support.cpp
@@ -292,7 +292,7 @@ void compute_globals(gmx_global_stat*               gstat,
                      const t_mdatoms*               mdatoms,
                      t_nrnb*                        nrnb,
                      t_vcm*                         vcm,
-                     gmx_wallcycle_t                wcycle,
+                     gmx_wallcycle*                 wcycle,
                      gmx_enerdata_t*                enerd,
                      tensor                         force_vir,
                      tensor                         shake_vir,
@@ -359,7 +359,7 @@ void compute_globals(gmx_global_stat*               gstat,
             gmx::ArrayRef<real> signalBuffer = signalCoordinator->getCommunicationBuffer();
             if (PAR(cr))
             {
-                wallcycle_start(wcycle, ewcMoveE);
+                wallcycle_start(wcycle, WallCycleCounter::MoveE);
                 global_stat(*gstat,
                             cr,
                             enerd,
@@ -372,7 +372,7 @@ void compute_globals(gmx_global_stat*               gstat,
                             signalBuffer,
                             *bSumEkinhOld,
                             flags);
-                wallcycle_stop(wcycle, ewcMoveE);
+                wallcycle_stop(wcycle, WallCycleCounter::MoveE);
             }
             signalCoordinator->finalizeSignals();
             *bSumEkinhOld = FALSE;
diff --git a/src/gromacs/mdlib/md_support.h b/src/gromacs/mdlib/md_support.h
index 40e1437c13..da949e10d1 100644
--- a/src/gromacs/mdlib/md_support.h
+++ b/src/gromacs/mdlib/md_support.h
@@ -123,7 +123,7 @@ void compute_globals(gmx_global_stat*               gstat,
                      const t_mdatoms*               mdatoms,
                      t_nrnb*                        nrnb,
                      t_vcm*                         vcm,
-                     gmx_wallcycle_t                wcycle,
+                     gmx_wallcycle*                 wcycle,
                      gmx_enerdata_t*                enerd,
                      tensor                         force_vir,
                      tensor                         shake_vir,
diff --git a/src/gromacs/mdlib/mdoutf.cpp b/src/gromacs/mdlib/mdoutf.cpp
index 8f8fd006a4..9769bac325 100644
--- a/src/gromacs/mdlib/mdoutf.cpp
+++ b/src/gromacs/mdlib/mdoutf.cpp
@@ -91,7 +91,7 @@ struct gmx_mdoutf
     int                            natoms_global;
     int                            natoms_x_compressed;
     const SimulationGroups*        groups; /* for compressed position writing */
-    gmx_wallcycle_t                wcycle;
+    gmx_wallcycle*                 wcycle;
     rvec*                          f_global;
     gmx::IMDOutputProvider*        outputProvider;
     const gmx::MDModulesNotifiers* mdModulesNotifiers;
@@ -110,7 +110,7 @@ gmx_mdoutf_t init_mdoutf(FILE*                          fplog,
                          const t_inputrec*              ir,
                          const gmx_mtop_t&              top_global,
                          const gmx_output_env_t*        oenv,
-                         gmx_wallcycle_t                wcycle,
+                         gmx_wallcycle*                 wcycle,
                          const gmx::StartingBehavior    startingBehavior,
                          bool                           simulationsShareState,
                          const gmx_multisim_t*          ms)
@@ -261,7 +261,7 @@ FILE* mdoutf_get_fp_dhdl(gmx_mdoutf_t of)
     return of->fp_dhdl;
 }
 
-gmx_wallcycle_t mdoutf_get_wcycle(gmx_mdoutf_t of)
+gmx_wallcycle* mdoutf_get_wcycle(gmx_mdoutf_t of)
 {
     return of->wcycle;
 }
@@ -753,10 +753,10 @@ void mdoutf_tng_close(gmx_mdoutf_t of)
 {
     if (of->tng || of->tng_low_prec)
     {
-        wallcycle_start(of->wcycle, ewcTRAJ);
+        wallcycle_start(of->wcycle, WallCycleCounter::Traj);
         gmx_tng_close(&of->tng);
         gmx_tng_close(&of->tng_low_prec);
-        wallcycle_stop(of->wcycle, ewcTRAJ);
+        wallcycle_stop(of->wcycle, WallCycleCounter::Traj);
     }
 }
 
diff --git a/src/gromacs/mdlib/mdoutf.h b/src/gromacs/mdlib/mdoutf.h
index 6461f9c747..308f08cecc 100644
--- a/src/gromacs/mdlib/mdoutf.h
+++ b/src/gromacs/mdlib/mdoutf.h
@@ -79,7 +79,7 @@ gmx_mdoutf_t init_mdoutf(FILE*                          fplog,
                          const t_inputrec*              ir,
                          const gmx_mtop_t&              mtop,
                          const gmx_output_env_t*        oenv,
-                         gmx_wallcycle_t                wcycle,
+                         gmx_wallcycle*                 wcycle,
                          gmx::StartingBehavior          startingBehavior,
                          bool                           simulationsShareState,
                          const gmx_multisim_t*          ms);
@@ -91,7 +91,7 @@ ener_file_t mdoutf_get_fp_ene(gmx_mdoutf_t of);
 FILE* mdoutf_get_fp_dhdl(gmx_mdoutf_t of);
 
 /*! \brief Getter for wallcycle timer */
-gmx_wallcycle_t mdoutf_get_wcycle(gmx_mdoutf_t of);
+gmx_wallcycle* mdoutf_get_wcycle(gmx_mdoutf_t of);
 
 /*! \brief Close TNG files if they are open.
  *
diff --git a/src/gromacs/mdlib/resethandler.cpp b/src/gromacs/mdlib/resethandler.cpp
index 7e807061ac..a471a6c828 100644
--- a/src/gromacs/mdlib/resethandler.cpp
+++ b/src/gromacs/mdlib/resethandler.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -80,7 +80,7 @@ ResetHandler::ResetHandler(compat::not_null<SimulationSignal*> signal,
                            bool                                resetHalfway,
                            real                                maximumHoursToRun,
                            const MDLogger&                     mdlog,
-                           gmx_wallcycle_t                     wcycle,
+                           gmx_wallcycle*                      wcycle,
                            gmx_walltime_accounting_t           walltime_accounting) :
     signal_(*signal),
     rankCanSetSignal_(false),
@@ -144,7 +144,7 @@ bool ResetHandler::resetCountersImpl(int64_t                     step,
                                      t_nrnb*                     nrnb,
                                      const gmx_pme_t*            pme,
                                      const pme_load_balancing_t* pme_loadbal,
-                                     gmx_wallcycle_t             wcycle,
+                                     gmx_wallcycle*              wcycle,
                                      gmx_walltime_accounting_t   walltime_accounting)
 {
     /* Reset either if signal has been passed, or if reset step has been reached */
@@ -194,14 +194,14 @@ bool ResetHandler::resetCountersImpl(int64_t                     step,
             resetGpuProfiler();
         }
 
-        wallcycle_stop(wcycle, ewcRUN);
+        wallcycle_stop(wcycle, WallCycleCounter::Run);
         wallcycle_reset_all(wcycle);
         if (DOMAINDECOMP(cr))
         {
             reset_dd_statistics_counters(cr->dd);
         }
         clear_nrnb(nrnb);
-        wallcycle_start(wcycle, ewcRUN);
+        wallcycle_start(wcycle, WallCycleCounter::Run);
         walltime_accounting_reset_time(walltime_accounting, step);
         print_date_and_time(fplog, cr->nodeid, "Restarted time", gmx_gettime());
 
diff --git a/src/gromacs/mdlib/sim_util.cpp b/src/gromacs/mdlib/sim_util.cpp
index 3da4bc541c..105245ebfd 100644
--- a/src/gromacs/mdlib/sim_util.cpp
+++ b/src/gromacs/mdlib/sim_util.cpp
@@ -193,7 +193,7 @@ static void pull_potential_wrapper(const t_commrec*               cr,
                                    pull_t*                        pull_work,
                                    const real*                    lambda,
                                    double                         t,
-                                   gmx_wallcycle_t                wcycle)
+                                   gmx_wallcycle*                 wcycle)
 {
     t_pbc pbc;
     real  dvdl;
@@ -201,7 +201,7 @@ static void pull_potential_wrapper(const t_commrec*               cr,
     /* Calculate the center of mass forces, this requires communication,
      * which is why pull_potential is called close to other communication.
      */
-    wallcycle_start(wcycle, ewcPULLPOT);
+    wallcycle_start(wcycle, WallCycleCounter::PullPot);
     set_pbc(&pbc, ir.pbcType, box);
     dvdl = 0;
     enerd->term[F_COM_PULL] +=
@@ -215,7 +215,7 @@ static void pull_potential_wrapper(const t_commrec*               cr,
                            force,
                            &dvdl);
     enerd->dvdl_lin[FreeEnergyPerturbationCouplingType::Restraint] += dvdl;
-    wallcycle_stop(wcycle, ewcPULLPOT);
+    wallcycle_stop(wcycle, WallCycleCounter::PullPot);
 }
 
 static void pme_receive_force_ener(t_forcerec*           fr,
@@ -224,18 +224,18 @@ static void pme_receive_force_ener(t_forcerec*           fr,
                                    gmx_enerdata_t*       enerd,
                                    bool                  useGpuPmePpComms,
                                    bool                  receivePmeForceToGpu,
-                                   gmx_wallcycle_t       wcycle)
+                                   gmx_wallcycle*        wcycle)
 {
     real  e_q, e_lj, dvdl_q, dvdl_lj;
     float cycles_ppdpme, cycles_seppme;
 
-    cycles_ppdpme = wallcycle_stop(wcycle, ewcPPDURINGPME);
+    cycles_ppdpme = wallcycle_stop(wcycle, WallCycleCounter::PpDuringPme);
     dd_cycles_add(cr->dd, cycles_ppdpme, ddCyclPPduringPME);
 
     /* In case of node-splitting, the PP nodes receive the long-range
      * forces, virial and energy from the PME nodes here.
      */
-    wallcycle_start(wcycle, ewcPP_PMEWAITRECVF);
+    wallcycle_start(wcycle, WallCycleCounter::PpPmeWaitRecvF);
     dvdl_q  = 0;
     dvdl_lj = 0;
     gmx_pme_receive_f(fr->pmePpCommGpu.get(),
@@ -257,7 +257,7 @@ static void pme_receive_force_ener(t_forcerec*           fr,
     {
         dd_cycles_add(cr->dd, cycles_seppme, ddCyclPME);
     }
-    wallcycle_stop(wcycle, ewcPP_PMEWAITRECVF);
+    wallcycle_stop(wcycle, WallCycleCounter::PpPmeWaitRecvF);
 }
 
 static void print_large_forces(FILE*                fp,
@@ -302,7 +302,7 @@ static void print_large_forces(FILE*                fp,
 
 //! When necessary, spreads forces on vsites and computes the virial for \p forceOutputs->forceWithShiftForces()
 static void postProcessForceWithShiftForces(t_nrnb*                   nrnb,
-                                            gmx_wallcycle_t           wcycle,
+                                            gmx_wallcycle*            wcycle,
                                             const matrix              box,
                                             ArrayRef<const RVec>      x,
                                             ForceOutputs*             forceOutputs,
@@ -342,7 +342,7 @@ static void postProcessForceWithShiftForces(t_nrnb*                   nrnb,
 static void postProcessForces(const t_commrec*          cr,
                               int64_t                   step,
                               t_nrnb*                   nrnb,
-                              gmx_wallcycle_t           wcycle,
+                              gmx_wallcycle*            wcycle,
                               const matrix              box,
                               ArrayRef<const RVec>      x,
                               ForceOutputs*             forceOutputs,
@@ -417,7 +417,7 @@ static void do_nb_verlet(t_forcerec*                fr,
                          const int                  clearF,
                          const int64_t              step,
                          t_nrnb*                    nrnb,
-                         gmx_wallcycle_t            wcycle)
+                         gmx_wallcycle*             wcycle)
 {
     if (!stepWork.computeNonbondedForces)
     {
@@ -438,9 +438,9 @@ static void do_nb_verlet(t_forcerec*                fr,
             /* Prune the pair-list beyond fr->ic->rlistPrune using
              * the current coordinates of the atoms.
              */
-            wallcycle_sub_start(wcycle, ewcsNONBONDED_PRUNING);
+            wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedPruning);
             nbv->dispatchPruneKernelCpu(ilocality, fr->shift_vec);
-            wallcycle_sub_stop(wcycle, ewcsNONBONDED_PRUNING);
+            wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedPruning);
         }
     }
 
@@ -624,7 +624,7 @@ static void computeSpecialForces(FILE*                          fplog,
                                  pull_t*                        pull_work,
                                  int64_t                        step,
                                  double                         t,
-                                 gmx_wallcycle_t                wcycle,
+                                 gmx_wallcycle*                 wcycle,
                                  gmx::ForceProviders*           forceProviders,
                                  const matrix                   box,
                                  gmx::ArrayRef<const gmx::RVec> x,
@@ -697,10 +697,10 @@ static void computeSpecialForces(FILE*                          fplog,
     /* Add the forces from enforced rotation potentials (if any) */
     if (inputrec.bRot)
     {
-        wallcycle_start(wcycle, ewcROTadd);
+        wallcycle_start(wcycle, WallCycleCounter::RotAdd);
         enerd->term[F_COM_PULL] +=
                 add_rot_forces(enforcedRotation, forceWithVirialMtsLevel0->force_, cr, step, t);
-        wallcycle_stop(wcycle, ewcROTadd);
+        wallcycle_stop(wcycle, WallCycleCounter::RotAdd);
     }
 
     if (ed)
@@ -734,7 +734,7 @@ static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
                                       const StepWorkload&   stepWork,
                                       GpuEventSynchronizer* xReadyOnDevice,
                                       const real            lambdaQ,
-                                      gmx_wallcycle_t       wcycle)
+                                      gmx_wallcycle*        wcycle)
 {
     pme_gpu_prepare_computation(pmedata, box, wcycle, stepWork);
     pme_gpu_launch_spread(pmedata, xReadyOnDevice, wcycle, lambdaQ);
@@ -751,7 +751,7 @@ static inline void launchPmeGpuSpread(gmx_pme_t*            pmedata,
  */
 static void launchPmeGpuFftAndGather(gmx_pme_t*               pmedata,
                                      const real               lambdaQ,
-                                     gmx_wallcycle_t          wcycle,
+                                     gmx_wallcycle*           wcycle,
                                      const gmx::StepWorkload& stepWork)
 {
     pme_gpu_launch_complex_transforms(pmedata, wcycle, stepWork);
@@ -783,7 +783,7 @@ static void alternatePmeNbGpuWaitReduce(nonbonded_verlet_t* nbv,
                                         gmx_enerdata_t*     enerd,
                                         const real          lambdaQ,
                                         const StepWorkload& stepWork,
-                                        gmx_wallcycle_t     wcycle)
+                                        gmx_wallcycle*      wcycle)
 {
     bool isPmeGpuDone = false;
     bool isNbGpuDone  = false;
@@ -839,9 +839,9 @@ static ForceOutputs setupForceOutputs(ForceHelperBuffers*                 forceH
                                       const DomainLifetimeWorkload&       domainWork,
                                       const StepWorkload&                 stepWork,
                                       const bool                          havePpDomainDecomposition,
-                                      gmx_wallcycle_t                     wcycle)
+                                      gmx_wallcycle*                      wcycle)
 {
-    wallcycle_sub_start(wcycle, ewcsCLEAR_FORCE_BUFFER);
+    wallcycle_sub_start(wcycle, WallCycleSubCounter::ClearForceBuffer);
 
     /* NOTE: We assume fr->shiftForces is all zeros here */
     gmx::ForceWithShiftForces forceWithShiftForces(
@@ -882,7 +882,7 @@ static ForceOutputs setupForceOutputs(ForceHelperBuffers*                 forceH
         clearRVecs(forceWithVirial.force_, true);
     }
 
-    wallcycle_sub_stop(wcycle, ewcsCLEAR_FORCE_BUFFER);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::ClearForceBuffer);
 
     return ForceOutputs(
             forceWithShiftForces, forceHelperBuffers->haveDirectVirialContributions(), forceWithVirial);
@@ -992,7 +992,7 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
                                     const gmx::MdrunScheduleWorkload& runScheduleWork,
                                     bool                              useGpuPmeOnThisRank,
                                     int64_t                           step,
-                                    gmx_wallcycle_t                   wcycle)
+                                    gmx_wallcycle*                    wcycle)
 {
     if (runScheduleWork.simulationWork.useGpuNonbonded && runScheduleWork.stepWork.computeNonbondedForces)
     {
@@ -1006,11 +1006,11 @@ static void launchGpuEndOfStepTasks(nonbonded_verlet_t*               nbv,
         }
 
         /* now clear the GPU outputs while we finish the step on the CPU */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+        wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
         Nbnxm::gpu_clear_outputs(nbv->gpu_nbv, runScheduleWork.stepWork.computeVirial);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+        wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
     }
 
     if (useGpuPmeOnThisRank)
@@ -1195,7 +1195,7 @@ void do_force(FILE*                               fplog,
               pull_t*                             pull_work,
               int64_t                             step,
               t_nrnb*                             nrnb,
-              gmx_wallcycle_t                     wcycle,
+              gmx_wallcycle*                      wcycle,
               const gmx_localtop_t*               top,
               const matrix                        box,
               gmx::ArrayRefWithPadding<gmx::RVec> x,
@@ -1374,12 +1374,12 @@ void do_force(FILE*                               fplog,
             fr->wholeMoleculeTransform->updateForAtomPbcJumps(x.unpaddedArrayRef(), box);
         }
 
-        wallcycle_start(wcycle, ewcNS);
+        wallcycle_start(wcycle, WallCycleCounter::NS);
         if (!DOMAINDECOMP(cr))
         {
             const rvec vzero       = { 0.0_real, 0.0_real, 0.0_real };
             const rvec boxDiagonal = { box[XX][XX], box[YY][YY], box[ZZ][ZZ] };
-            wallcycle_sub_start(wcycle, ewcsNBS_GRID_LOCAL);
+            wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridLocal);
             nbnxn_put_on_grid(nbv,
                               box,
                               0,
@@ -1392,30 +1392,30 @@ void do_force(FILE*                               fplog,
                               x.unpaddedArrayRef(),
                               0,
                               nullptr);
-            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_LOCAL);
+            wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridLocal);
         }
         else
         {
-            wallcycle_sub_start(wcycle, ewcsNBS_GRID_NONLOCAL);
+            wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSGridNonLocal);
             nbnxn_put_on_grid_nonlocal(nbv, domdec_zones(cr->dd), fr->cginfo, x.unpaddedArrayRef());
-            wallcycle_sub_stop(wcycle, ewcsNBS_GRID_NONLOCAL);
+            wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSGridNonLocal);
         }
 
         nbv->setAtomProperties(gmx::constArrayRefFromArray(mdatoms->typeA, mdatoms->nr),
                                gmx::constArrayRefFromArray(mdatoms->chargeA, mdatoms->nr),
                                fr->cginfo);
 
-        wallcycle_stop(wcycle, ewcNS);
+        wallcycle_stop(wcycle, WallCycleCounter::NS);
 
         /* initialize the GPU nbnxm atom data and bonded data structures */
         if (simulationWork.useGpuNonbonded)
         {
             // Note: cycle counting only nononbondeds, gpuBonded counts internally
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-            wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+            wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
             Nbnxm::gpu_init_atomdata(nbv->gpu_nbv, nbv->nbat.get());
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+            wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+            wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
 
             if (fr->gpuBonded)
             {
@@ -1440,15 +1440,15 @@ void do_force(FILE*                               fplog,
         runScheduleWork->domainWork = setupDomainLifetimeWorkload(
                 inputrec, *fr, pull_work, ed, *mdatoms, simulationWork, stepWork);
 
-        wallcycle_start_nocount(wcycle, ewcNS);
-        wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_LOCAL);
+        wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
+        wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchLocal);
         /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
         nbv->constructPairlist(InteractionLocality::Local, top->excls, step, nrnb);
 
         nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::Local);
 
-        wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_LOCAL);
-        wallcycle_stop(wcycle, ewcNS);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchLocal);
+        wallcycle_stop(wcycle, WallCycleCounter::NS);
 
         if (stepWork.useGpuXBufferOps)
         {
@@ -1484,15 +1484,15 @@ void do_force(FILE*                               fplog,
     {
         ddBalanceRegionHandler.openBeforeForceComputationGpu();
 
-        wallcycle_start(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
+        wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
         Nbnxm::gpu_upload_shiftvec(nbv->gpu_nbv, nbv->nbat.get());
         if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
         {
             Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::Local);
         }
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+        wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
         // with X buffer ops offloaded to the GPU on all but the search steps
 
         // bonded work not split into separate local and non-local, so with DD
@@ -1503,11 +1503,11 @@ void do_force(FILE*                               fplog,
         }
 
         /* launch local nonbonded work on GPU */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+        wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
         do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::Local, enbvClearFNo, step, nrnb, wcycle);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+        wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
     }
 
     if (useGpuPmeOnThisRank)
@@ -1529,14 +1529,14 @@ void do_force(FILE*                               fplog,
         if (stepWork.doNeighborSearch)
         {
             // TODO: fuse this branch with the above large stepWork.doNeighborSearch block
-            wallcycle_start_nocount(wcycle, ewcNS);
-            wallcycle_sub_start(wcycle, ewcsNBS_SEARCH_NONLOCAL);
+            wallcycle_start_nocount(wcycle, WallCycleCounter::NS);
+            wallcycle_sub_start(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
             /* Note that with a GPU the launch overhead of the list transfer is not timed separately */
             nbv->constructPairlist(InteractionLocality::NonLocal, top->excls, step, nrnb);
 
             nbv->setupGpuShortRangeWork(fr->gpuBonded, InteractionLocality::NonLocal);
-            wallcycle_sub_stop(wcycle, ewcsNBS_SEARCH_NONLOCAL);
-            wallcycle_stop(wcycle, ewcNS);
+            wallcycle_sub_stop(wcycle, WallCycleSubCounter::NBSSearchNonLocal);
+            wallcycle_stop(wcycle, WallCycleCounter::NS);
             // TODO refactor this GPU halo exchange re-initialisation
             // to location in do_md where GPU halo exchange is
             // constructed at partitioning, after above stateGpu
@@ -1593,11 +1593,11 @@ void do_force(FILE*                               fplog,
 
             if (stepWork.doNeighborSearch || !stepWork.useGpuXBufferOps)
             {
-                wallcycle_start(wcycle, ewcLAUNCH_GPU);
-                wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+                wallcycle_start(wcycle, WallCycleCounter::LaunchGpu);
+                wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
                 Nbnxm::gpu_copy_xq_to_gpu(nbv->gpu_nbv, nbv->nbat.get(), AtomLocality::NonLocal);
-                wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-                wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+                wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+                wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
             }
 
             if (domainWork.haveGpuBondedWork)
@@ -1606,32 +1606,32 @@ void do_force(FILE*                               fplog,
             }
 
             /* launch non-local nonbonded tasks on GPU */
-            wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-            wallcycle_sub_start(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+            wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+            wallcycle_sub_start(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
             do_nb_verlet(fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFNo, step, nrnb, wcycle);
-            wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
-            wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+            wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
+            wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
         }
     }
 
     if (simulationWork.useGpuNonbonded && stepWork.computeNonbondedForces)
     {
         /* launch D2H copy-back F */
-        wallcycle_start_nocount(wcycle, ewcLAUNCH_GPU);
-        wallcycle_sub_start_nocount(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_start_nocount(wcycle, WallCycleCounter::LaunchGpu);
+        wallcycle_sub_start_nocount(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
 
         if (havePPDomainDecomposition(cr))
         {
             Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::NonLocal);
         }
         Nbnxm::gpu_launch_cpyback(nbv->gpu_nbv, nbv->nbat.get(), stepWork, AtomLocality::Local);
-        wallcycle_sub_stop(wcycle, ewcsLAUNCH_GPU_NONBONDED);
+        wallcycle_sub_stop(wcycle, WallCycleSubCounter::LaunchGpuNonBonded);
 
         if (domainWork.haveGpuBondedWork && stepWork.computeEnergy)
         {
             fr->gpuBonded->launchEnergyTransfer();
         }
-        wallcycle_stop(wcycle, ewcLAUNCH_GPU);
+        wallcycle_stop(wcycle, WallCycleCounter::LaunchGpu);
     }
 
     gmx::ArrayRef<const gmx::RVec> xWholeMolecules;
@@ -1676,7 +1676,7 @@ void do_force(FILE*                               fplog,
 
     if (DOMAINDECOMP(cr) && !thisRankHasDuty(cr, DUTY_PME))
     {
-        wallcycle_start(wcycle, ewcPPDURINGPME);
+        wallcycle_start(wcycle, WallCycleCounter::PpDuringPme);
         dd_force_flop_start(cr->dd, nrnb);
     }
 
@@ -1691,15 +1691,15 @@ void do_force(FILE*                               fplog,
 
     if (inputrec.bRot)
     {
-        wallcycle_start(wcycle, ewcROT);
+        wallcycle_start(wcycle, WallCycleCounter::Rot);
         do_rotation(cr, enforcedRotation, box, x.unpaddedConstArrayRef(), t, step, stepWork.doNeighborSearch);
-        wallcycle_stop(wcycle, ewcROT);
+        wallcycle_stop(wcycle, WallCycleCounter::Rot);
     }
 
     /* Start the force cycle counter.
      * Note that a different counter is used for dynamic load balancing.
      */
-    wallcycle_start(wcycle, ewcFORCE);
+    wallcycle_start(wcycle, WallCycleCounter::Force);
 
     /* Set up and clear force outputs:
      * forceOutMtsLevel0:  everything except what is in the other two outputs
@@ -1799,10 +1799,10 @@ void do_force(FILE*                               fplog,
              * This can be split into a local and a non-local part when overlapping
              * communication with calculation with domain decomposition.
              */
-            wallcycle_stop(wcycle, ewcFORCE);
+            wallcycle_stop(wcycle, WallCycleCounter::Force);
             nbv->atomdata_add_nbat_f_to_f(AtomLocality::All,
                                           forceOutNonbonded->forceWithShiftForces().force());
-            wallcycle_start_nocount(wcycle, ewcFORCE);
+            wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
         }
 
         /* If there are multiple fshift output buffers we need to reduce them */
@@ -1818,10 +1818,10 @@ void do_force(FILE*                               fplog,
     // TODO Force flags should include haveFreeEnergyWork for this domain
     if (stepWork.useGpuXHalo && (domainWork.haveCpuBondedWork || domainWork.haveFreeEnergyWork))
     {
-        wallcycle_stop(wcycle, ewcFORCE);
+        wallcycle_stop(wcycle, WallCycleCounter::Force);
         /* Wait for non-local coordinate data to be copied from device */
         stateGpu->waitCoordinatesReadyOnHost(AtomLocality::NonLocal);
-        wallcycle_start_nocount(wcycle, ewcFORCE);
+        wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
     }
 
     // Compute wall interactions, when present.
@@ -1906,7 +1906,7 @@ void do_force(FILE*                               fplog,
                                      ddBalanceRegionHandler);
     }
 
-    wallcycle_stop(wcycle, ewcFORCE);
+    wallcycle_stop(wcycle, WallCycleCounter::Force);
 
     // VdW dispersion correction, only computed on master rank to avoid double counting
     if ((stepWork.computeEnergy || stepWork.computeVirial) && fr->dispersionCorrection && MASTER(cr))
@@ -1983,10 +1983,10 @@ void do_force(FILE*                               fplog,
             }
             else
             {
-                wallcycle_start_nocount(wcycle, ewcFORCE);
+                wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
                 do_nb_verlet(
                         fr, ic, enerd, stepWork, InteractionLocality::NonLocal, enbvClearFYes, step, nrnb, wcycle);
-                wallcycle_stop(wcycle, ewcFORCE);
+                wallcycle_stop(wcycle, WallCycleCounter::Force);
             }
 
             if (stepWork.useGpuFBufferOps)
@@ -2152,7 +2152,7 @@ void do_force(FILE*                               fplog,
     {
         // NOTE: emulation kernel is not included in the balancing region,
         // but emulation mode does not target performance anyway
-        wallcycle_start_nocount(wcycle, ewcFORCE);
+        wallcycle_start_nocount(wcycle, WallCycleCounter::Force);
         do_nb_verlet(fr,
                      ic,
                      enerd,
@@ -2162,7 +2162,7 @@ void do_force(FILE*                               fplog,
                      step,
                      nrnb,
                      wcycle);
-        wallcycle_stop(wcycle, ewcFORCE);
+        wallcycle_stop(wcycle, WallCycleCounter::Force);
     }
 
     // If on GPU PME-PP comms path, receive forces from PME before GPU buffer ops
diff --git a/src/gromacs/mdlib/trajectory_writing.cpp b/src/gromacs/mdlib/trajectory_writing.cpp
index a7787ea97d..53f1e7ff6d 100644
--- a/src/gromacs/mdlib/trajectory_writing.cpp
+++ b/src/gromacs/mdlib/trajectory_writing.cpp
@@ -121,7 +121,7 @@ void do_md_trajectory_writing(FILE*                          fplog,
 
     if (mdof_flags != 0)
     {
-        wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ);
+        wallcycle_start(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
         if (bCPT)
         {
             if (MASTER(cr))
@@ -191,7 +191,7 @@ void do_md_trajectory_writing(FILE*                          fplog,
                 sfree(x_for_confout);
             }
         }
-        wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ);
+        wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
     }
 #if GMX_FAHCORE
     if (MASTER(cr))
diff --git a/src/gromacs/mdlib/update.cpp b/src/gromacs/mdlib/update.cpp
index 14f03826ed..a2876a3123 100644
--- a/src/gromacs/mdlib/update.cpp
+++ b/src/gromacs/mdlib/update.cpp
@@ -139,7 +139,7 @@ public:
     void finish_update(const t_inputrec& inputRecord,
                        const t_mdatoms*  md,
                        t_state*          state,
-                       gmx_wallcycle_t   wcycle,
+                       gmx_wallcycle*    wcycle,
                        bool              haveConstraints);
 
     void update_sd_second_half(const t_inputrec&                 inputRecord,
@@ -151,7 +151,7 @@ public:
                                t_state*                          state,
                                const t_commrec*                  cr,
                                t_nrnb*                           nrnb,
-                               gmx_wallcycle_t                   wcycle,
+                               gmx_wallcycle*                    wcycle,
                                gmx::Constraints*                 constr,
                                bool                              do_log,
                                bool                              do_ene);
@@ -246,7 +246,7 @@ void Update::update_coords(const t_inputrec&                                inpu
 void Update::finish_update(const t_inputrec& inputRecord,
                            const t_mdatoms*  md,
                            t_state*          state,
-                           gmx_wallcycle_t   wcycle,
+                           gmx_wallcycle*    wcycle,
                            const bool        haveConstraints)
 {
     return impl_->finish_update(inputRecord, md, state, wcycle, haveConstraints);
@@ -259,7 +259,7 @@ void Update::update_sd_second_half(const t_inputrec& inputRecord,
                                    t_state*          state,
                                    const t_commrec*  cr,
                                    t_nrnb*           nrnb,
-                                   gmx_wallcycle_t   wcycle,
+                                   gmx_wallcycle*    wcycle,
                                    gmx::Constraints* constr,
                                    bool              do_log,
                                    bool              do_ene)
@@ -1394,7 +1394,7 @@ void Update::Impl::update_sd_second_half(const t_inputrec&                 input
                                          t_state*                          state,
                                          const t_commrec*                  cr,
                                          t_nrnb*                           nrnb,
-                                         gmx_wallcycle_t                   wcycle,
+                                         gmx_wallcycle*                    wcycle,
                                          gmx::Constraints*                 constr,
                                          bool                              do_log,
                                          bool                              do_ene)
@@ -1415,7 +1415,7 @@ void Update::Impl::update_sd_second_half(const t_inputrec&                 input
          */
         real dt = inputRecord.delta_t;
 
-        wallcycle_start(wcycle, ewcUPDATE);
+        wallcycle_start(wcycle, WallCycleCounter::Update);
 
         int nth = gmx_omp_nthreads_get(emntUpdate);
 
@@ -1448,7 +1448,7 @@ void Update::Impl::update_sd_second_half(const t_inputrec&                 input
             GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
         }
         inc_nrnb(nrnb, eNR_UPDATE, homenr);
-        wallcycle_stop(wcycle, ewcUPDATE);
+        wallcycle_stop(wcycle, WallCycleCounter::Update);
 
         /* Constrain the coordinates upd->xp for half a time step */
         bool computeVirial = false;
@@ -1473,14 +1473,14 @@ void Update::Impl::update_sd_second_half(const t_inputrec&                 input
 void Update::Impl::finish_update(const t_inputrec& inputRecord,
                                  const t_mdatoms*  md,
                                  t_state*          state,
-                                 gmx_wallcycle_t   wcycle,
+                                 gmx_wallcycle*    wcycle,
                                  const bool        haveConstraints)
 {
     /* NOTE: Currently we always integrate to a temporary buffer and
      * then copy the results back here.
      */
 
-    wallcycle_start_nocount(wcycle, ewcUPDATE);
+    wallcycle_start_nocount(wcycle, WallCycleCounter::Update);
 
     const int homenr = md->homenr;
     auto      xp     = makeConstArrayRef(xp_).subArray(0, homenr);
@@ -1521,7 +1521,7 @@ void Update::Impl::finish_update(const t_inputrec& inputRecord,
         }
     }
 
-    wallcycle_stop(wcycle, ewcUPDATE);
+    wallcycle_stop(wcycle, WallCycleCounter::Update);
 }
 
 void Update::Impl::update_coords(const t_inputrec&                 inputRecord,
diff --git a/src/gromacs/mdlib/update.h b/src/gromacs/mdlib/update.h
index 10a93faa51..5992a69e45 100644
--- a/src/gromacs/mdlib/update.h
+++ b/src/gromacs/mdlib/update.h
@@ -144,7 +144,7 @@ public:
     void finish_update(const t_inputrec& inputRecord,
                        const t_mdatoms*  md,
                        t_state*          state,
-                       gmx_wallcycle_t   wcycle,
+                       gmx_wallcycle*    wcycle,
                        bool              haveConstraints);
 
     /*! \brief Secong part of the SD integrator.
@@ -172,7 +172,7 @@ public:
                                t_state*          state,
                                const t_commrec*  cr,
                                t_nrnb*           nrnb,
-                               gmx_wallcycle_t   wcycle,
+                               gmx_wallcycle*    wcycle,
                                gmx::Constraints* constr,
                                bool              do_log,
                                bool              do_ene);
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index 5fc4fb8609..5a950d34e5 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -109,8 +109,8 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fRead
                                          const float                       dtPressureCouple,
                                          const matrix                      prVelocityScalingMatrix)
 {
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
 
     // Clearing virial matrix
     // TODO There is no point in having separate virial matrix for constraints
@@ -141,16 +141,16 @@ void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fRead
 
     coordinatesReady_->markEvent(deviceStream_);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 
     return;
 }
 
 void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
 {
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
 
     ScalingMatrix mu(scalingMatrix);
 
@@ -166,14 +166,14 @@ void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
     //       can affect the performance if nstpcouple is small.
     deviceStream_.synchronize();
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix)
 {
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
 
     ScalingMatrix mu(scalingMatrix);
 
@@ -189,8 +189,8 @@ void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix)
     //       can affect the performance if nstpcouple is small.
     deviceStream_.synchronize();
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
@@ -227,8 +227,8 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer<Float3>          d_x,
                                    const t_mdatoms&              md)
 {
     // TODO wallcycle
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
 
     GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
     GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
@@ -253,8 +253,8 @@ void UpdateConstrainGpu::Impl::set(DeviceBuffer<Float3>          d_x,
     coordinateScalingKernelLaunchConfig_.gridSize[0] =
             (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_UPDATE_CONSTRAIN);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box)
diff --git a/src/gromacs/mdlib/update_vv.cpp b/src/gromacs/mdlib/update_vv.cpp
index 76334de0c5..0db7c24791 100644
--- a/src/gromacs/mdlib/update_vv.cpp
+++ b/src/gromacs/mdlib/update_vv.cpp
@@ -119,7 +119,7 @@ void integrateVVFirstStep(int64_t                                  step,
         /*  ############### START FIRST UPDATE HALF-STEP FOR VV METHODS############### */
         rvec* vbuf = nullptr;
 
-        wallcycle_start(wcycle, ewcUPDATE);
+        wallcycle_start(wcycle, WallCycleCounter::Update);
         if (ir->eI == IntegrationAlgorithm::VV && bInitStep)
         {
             /* if using velocity verlet with full time step Ekin,
@@ -140,9 +140,9 @@ void integrateVVFirstStep(int64_t                                  step,
         upd->update_coords(
                 *ir, step, mdatoms, state, f->view().forceWithPadding(), fcdata, ekind, M, etrtVELOCITY1, cr, constr != nullptr);
 
-        wallcycle_stop(wcycle, ewcUPDATE);
+        wallcycle_stop(wcycle, WallCycleCounter::Update);
         constrain_velocities(constr, do_log, do_ene, step, state, nullptr, bCalcVir, shake_vir);
-        wallcycle_start(wcycle, ewcUPDATE);
+        wallcycle_start(wcycle, WallCycleCounter::Update);
         /* if VV, compute the pressure and constraints */
         /* For VV2, we strictly only need this if using pressure
          * control, but we really would like to have accurate pressures
@@ -163,7 +163,7 @@ void integrateVVFirstStep(int64_t                                  step,
             So we need information from the last step in the first half of the integration */
         if (bGStat || do_per_step(step - 1, nstglobalcomm))
         {
-            wallcycle_stop(wcycle, ewcUPDATE);
+            wallcycle_stop(wcycle, WallCycleCounter::Update);
             int cglo_flags =
                     ((bGStat ? CGLO_GSTAT : 0) | (bCalcEner ? CGLO_ENERGY : 0)
                      | (bTemp ? CGLO_TEMPERATURE : 0) | (bPres ? CGLO_PRESSURE : 0)
@@ -212,7 +212,7 @@ void integrateVVFirstStep(int64_t                                  step,
                         fplog, vcm, *mdatoms, makeArrayRef(state->x), makeArrayRef(state->v));
                 inc_nrnb(nrnb, eNR_STOPCM, mdatoms->homenr);
             }
-            wallcycle_start(wcycle, ewcUPDATE);
+            wallcycle_start(wcycle, WallCycleCounter::Update);
         }
         /* temperature scaling and pressure scaling to produce the extended variables at t+dt */
         if (!bInitStep)
@@ -241,7 +241,7 @@ void integrateVVFirstStep(int64_t                                  step,
             }
             else if (bExchanged)
             {
-                wallcycle_stop(wcycle, ewcUPDATE);
+                wallcycle_stop(wcycle, WallCycleCounter::Update);
                 /* We need the kinetic energy at minus the half step for determining
                  * the full step kinetic energy and possibly for T-coupling.*/
                 /* This may not be quite working correctly yet . . . . */
@@ -267,7 +267,7 @@ void integrateVVFirstStep(int64_t                                  step,
                                 state->box,
                                 bSumEkinhOld,
                                 CGLO_GSTAT | CGLO_TEMPERATURE);
-                wallcycle_start(wcycle, ewcUPDATE);
+                wallcycle_start(wcycle, WallCycleCounter::Update);
             }
         }
         /* if it's the initial step, we performed this first step just to get the constraint virial */
@@ -276,7 +276,7 @@ void integrateVVFirstStep(int64_t                                  step,
             copy_rvecn(vbuf, state->v.rvec_array(), 0, state->natoms);
             sfree(vbuf);
         }
-        wallcycle_stop(wcycle, ewcUPDATE);
+        wallcycle_stop(wcycle, WallCycleCounter::Update);
     }
 
     /* compute the conserved quantity */
@@ -355,7 +355,7 @@ void integrateVVSecondStep(int64_t                                  step,
     upd->update_coords(
             *ir, step, mdatoms, state, f->view().forceWithPadding(), fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
 
-    wallcycle_stop(wcycle, ewcUPDATE);
+    wallcycle_stop(wcycle, WallCycleCounter::Update);
 
     constrain_coordinates(
             constr, do_log, do_ene, step, state, upd->xp()->arrayRefWithPadding(), dvdl_constr, bCalcVir, shake_vir);
@@ -390,14 +390,14 @@ void integrateVVSecondStep(int64_t                                  step,
                         lastbox,
                         bSumEkinhOld,
                         (bGStat ? CGLO_GSTAT : 0) | CGLO_TEMPERATURE);
-        wallcycle_start(wcycle, ewcUPDATE);
+        wallcycle_start(wcycle, WallCycleCounter::Update);
         trotter_update(ir, step, ekind, enerd, state, total_vir, mdatoms, MassQ, trotter_seq, ettTSEQ4);
         /* now we know the scaling, we can compute the positions again */
         std::copy(cbuf->begin(), cbuf->end(), state->x.begin());
 
         upd->update_coords(
                 *ir, step, mdatoms, state, f->view().forceWithPadding(), fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
-        wallcycle_stop(wcycle, ewcUPDATE);
+        wallcycle_stop(wcycle, WallCycleCounter::Update);
 
         /* do we need an extra constraint here? just need to copy out of as_rvec_array(state->v.data()) to upd->xp? */
         /* are the small terms in the shake_vir here due
diff --git a/src/gromacs/mdlib/vsite.cpp b/src/gromacs/mdlib/vsite.cpp
index 6dd3c6f104..9be178c243 100644
--- a/src/gromacs/mdlib/vsite.cpp
+++ b/src/gromacs/mdlib/vsite.cpp
@@ -2283,7 +2283,7 @@ void VirtualSitesHandler::Impl::spreadForces(ArrayRef<const RVec> x,
                                              const matrix         box,
                                              gmx_wallcycle*       wcycle)
 {
-    wallcycle_start(wcycle, ewcVSITESPREAD);
+    wallcycle_start(wcycle, WallCycleCounter::VsiteSpread);
 
     const bool useDomdec = domainInfo_.useDomdec();
 
@@ -2477,7 +2477,7 @@ void VirtualSitesHandler::Impl::spreadForces(ArrayRef<const RVec> x,
     inc_nrnb(nrnb, eNR_VSITE4FDN, vsite_count(ilists_, F_VSITE4FDN));
     inc_nrnb(nrnb, eNR_VSITEN, vsite_count(ilists_, F_VSITEN));
 
-    wallcycle_stop(wcycle, ewcVSITESPREAD);
+    wallcycle_stop(wcycle, WallCycleCounter::VsiteSpread);
 }
 
 /*! \brief Returns the an array with group indices for each atom
diff --git a/src/gromacs/mdrun/md.cpp b/src/gromacs/mdrun/md.cpp
index e398e7693e..ecb0778652 100644
--- a/src/gromacs/mdrun/md.cpp
+++ b/src/gromacs/mdrun/md.cpp
@@ -796,7 +796,7 @@ void gmx::LegacySimulator::do_md()
     }
 
     walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
+    wallcycle_start(wcycle, WallCycleCounter::Run);
     print_start(fplog, cr, walltime_accounting, "mdrun");
 
     /***********************************************************
@@ -890,7 +890,7 @@ void gmx::LegacySimulator::do_md()
                            simulationWork.useGpuPmePpCommunication);
         }
 
-        wallcycle_start(wcycle, ewcSTEP);
+        wallcycle_start(wcycle, WallCycleCounter::Step);
 
         bLastStep = (step_rel == ir->nsteps);
         t         = t0 + step * ir->delta_t;
@@ -962,7 +962,7 @@ void gmx::LegacySimulator::do_md()
         if (vsite != nullptr)
         {
             // Virtual sites need to be updated before domain decomposition and forces are calculated
-            wallcycle_start(wcycle, ewcVSITECONSTR);
+            wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
             // md-vv calculates virtual velocities once it has full-step real velocities
             vsite->construct(state->x,
                              state->v,
@@ -970,7 +970,7 @@ void gmx::LegacySimulator::do_md()
                              (!EI_VV(inputrec->eI) && needVirtualVelocitiesThisStep)
                                      ? VSiteOperation::PositionsAndVelocities
                                      : VSiteOperation::Positions);
-            wallcycle_stop(wcycle, ewcVSITECONSTR);
+            wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
         }
 
         if (bNS && !(bFirstStep && ir->bContinuation))
@@ -1264,9 +1264,9 @@ void gmx::LegacySimulator::do_md()
             if (vsite != nullptr && needVirtualVelocitiesThisStep)
             {
                 // Positions were calculated earlier
-                wallcycle_start(wcycle, ewcVSITECONSTR);
+                wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
                 vsite->construct(state->x, state->v, state->box, VSiteOperation::Velocities);
-                wallcycle_stop(wcycle, ewcVSITECONSTR);
+                wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
             }
         }
 
@@ -1407,7 +1407,7 @@ void gmx::LegacySimulator::do_md()
 
         if (!useGpuForUpdate)
         {
-            wallcycle_start(wcycle, ewcUPDATE);
+            wallcycle_start(wcycle, WallCycleCounter::Update);
         }
         /* UPDATE PRESSURE VARIABLES IN TROTTER FORMULATION WITH CONSTRAINTS */
         if (bTrotter)
@@ -1564,7 +1564,7 @@ void gmx::LegacySimulator::do_md()
                 upd.update_coords(
                         *ir, step, mdatoms, state, forceCombined, fcdata, ekind, M, etrtPOSITION, cr, constr != nullptr);
 
-                wallcycle_stop(wcycle, ewcUPDATE);
+                wallcycle_stop(wcycle, WallCycleCounter::Update);
 
                 constrain_coordinates(constr,
                                       do_log,
@@ -1934,7 +1934,7 @@ void gmx::LegacySimulator::do_md()
             rescale_membed(step_rel, membed, as_rvec_array(state_global->x.data()));
         }
 
-        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
         if (DOMAINDECOMP(cr) && wcycle)
         {
             dd_cycles_add(cr->dd, cycles, ddCyclStep);
diff --git a/src/gromacs/mdrun/mimic.cpp b/src/gromacs/mdrun/mimic.cpp
index d5403f1945..25feef03cd 100644
--- a/src/gromacs/mdrun/mimic.cpp
+++ b/src/gromacs/mdrun/mimic.cpp
@@ -387,7 +387,7 @@ void gmx::LegacySimulator::do_mimic()
     }
 
     walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
+    wallcycle_start(wcycle, WallCycleCounter::Run);
     print_start(fplog, cr, walltime_accounting, "mdrun");
 
     /***********************************************************
@@ -438,7 +438,7 @@ void gmx::LegacySimulator::do_mimic()
     while (!isLastStep)
     {
         isLastStep = (isLastStep || (ir->nsteps >= 0 && step_rel == ir->nsteps));
-        wallcycle_start(wcycle, ewcSTEP);
+        wallcycle_start(wcycle, WallCycleCounter::Step);
 
         t = step;
 
@@ -464,9 +464,9 @@ void gmx::LegacySimulator::do_mimic()
             }
             if (constructVsites)
             {
-                wallcycle_start(wcycle, ewcVSITECONSTR);
+                wallcycle_start(wcycle, WallCycleCounter::VsiteConstr);
                 vsite->construct(state->x, state->v, state->box, VSiteOperation::PositionsAndVelocities);
-                wallcycle_stop(wcycle, ewcVSITECONSTR);
+                wallcycle_stop(wcycle, WallCycleCounter::VsiteConstr);
             }
         }
 
@@ -756,7 +756,7 @@ void gmx::LegacySimulator::do_mimic()
             print_time(stderr, walltime_accounting, step, ir, cr);
         }
 
-        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
         if (DOMAINDECOMP(cr) && wcycle)
         {
             dd_cycles_add(cr->dd, cycles, ddCyclStep);
diff --git a/src/gromacs/mdrun/minimize.cpp b/src/gromacs/mdrun/minimize.cpp
index e564223b81..2725353d5c 100644
--- a/src/gromacs/mdrun/minimize.cpp
+++ b/src/gromacs/mdrun/minimize.cpp
@@ -140,18 +140,18 @@ typedef struct em_state
 static void print_em_start(FILE*                     fplog,
                            const t_commrec*          cr,
                            gmx_walltime_accounting_t walltime_accounting,
-                           gmx_wallcycle_t           wcycle,
+                           gmx_wallcycle*            wcycle,
                            const char*               name)
 {
     walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
+    wallcycle_start(wcycle, WallCycleCounter::Run);
     print_start(fplog, cr, walltime_accounting, name);
 }
 
 //! Stop counting time for EM
-static void em_time_end(gmx_walltime_accounting_t walltime_accounting, gmx_wallcycle_t wcycle)
+static void em_time_end(gmx_walltime_accounting_t walltime_accounting, gmx_wallcycle* wcycle)
 {
-    wallcycle_stop(wcycle, ewcRUN);
+    wallcycle_stop(wcycle, WallCycleCounter::Run);
 
     walltime_accounting_end_time(walltime_accounting);
 }
@@ -516,7 +516,7 @@ static void init_em(FILE*                fplog,
 static void finish_em(const t_commrec*          cr,
                       gmx_mdoutf_t              outf,
                       gmx_walltime_accounting_t walltime_accounting,
-                      gmx_wallcycle_t           wcycle)
+                      gmx_wallcycle*            wcycle)
 {
     if (!thisRankHasDuty(cr, DUTY_PME))
     {
@@ -793,7 +793,7 @@ static void em_dd_partition_system(FILE*                fplog,
                                    VirtualSitesHandler* vsite,
                                    gmx::Constraints*    constr,
                                    t_nrnb*              nrnb,
-                                   gmx_wallcycle_t      wcycle)
+                                   gmx_wallcycle*       wcycle)
 {
     /* Repartition the domain decomposition */
     dd_partition_system(fplog,
@@ -915,7 +915,7 @@ public:
     //! Manages flop accounting.
     t_nrnb* nrnb;
     //! Manages wall cycle accounting.
-    gmx_wallcycle_t wcycle;
+    gmx_wallcycle* wcycle;
     //! Coordinates global reduction.
     gmx_global_stat_t gstat;
     //! Handles virtual sites.
@@ -1026,7 +1026,7 @@ void EnergyEvaluator::run(em_state_t* ems, rvec mu_tot, tensor vir, tensor pres,
     /* Communicate stuff when parallel */
     if (PAR(cr) && inputrec->eI != IntegrationAlgorithm::NM)
     {
-        wallcycle_start(wcycle, ewcMoveE);
+        wallcycle_start(wcycle, WallCycleCounter::MoveE);
 
         global_stat(*gstat,
                     cr,
@@ -1041,7 +1041,7 @@ void EnergyEvaluator::run(em_state_t* ems, rvec mu_tot, tensor vir, tensor pres,
                     FALSE,
                     CGLO_ENERGY | CGLO_PRESSURE | CGLO_CONSTRAINT);
 
-        wallcycle_stop(wcycle, ewcMoveE);
+        wallcycle_stop(wcycle, WallCycleCounter::MoveE);
     }
 
     if (fr->dispersionCorrection)
diff --git a/src/gromacs/mdrun/rerun.cpp b/src/gromacs/mdrun/rerun.cpp
index e1234ade76..157d18e55b 100644
--- a/src/gromacs/mdrun/rerun.cpp
+++ b/src/gromacs/mdrun/rerun.cpp
@@ -431,7 +431,7 @@ void gmx::LegacySimulator::do_rerun()
     }
 
     walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
+    wallcycle_start(wcycle, WallCycleCounter::Run);
     print_start(fplog, cr, walltime_accounting, "mdrun");
 
     /***********************************************************
@@ -528,7 +528,7 @@ void gmx::LegacySimulator::do_rerun()
     isLastStep = (isLastStep || (ir->nsteps >= 0 && step_rel > ir->nsteps));
     while (!isLastStep)
     {
-        wallcycle_start(wcycle, ewcSTEP);
+        wallcycle_start(wcycle, WallCycleCounter::Step);
 
         if (rerun_fr.bStep)
         {
@@ -863,7 +863,7 @@ void gmx::LegacySimulator::do_rerun()
             rerun_parallel_comm(cr, &rerun_fr, &isLastStep);
         }
 
-        cycles = wallcycle_stop(wcycle, ewcSTEP);
+        cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
         if (DOMAINDECOMP(cr) && wcycle)
         {
             dd_cycles_add(cr->dd, cycles, ddCyclStep);
diff --git a/src/gromacs/mdrun/runner.cpp b/src/gromacs/mdrun/runner.cpp
index 33f9145889..45fbace426 100644
--- a/src/gromacs/mdrun/runner.cpp
+++ b/src/gromacs/mdrun/runner.cpp
@@ -653,7 +653,7 @@ static void finish_run(FILE*                     fplog,
                        const t_commrec*          cr,
                        const t_inputrec&         inputrec,
                        t_nrnb                    nrnb[],
-                       gmx_wallcycle_t           wcycle,
+                       gmx_wallcycle*            wcycle,
                        gmx_walltime_accounting_t walltime_accounting,
                        nonbonded_verlet_t*       nbv,
                        const gmx_pme_t*          pme,
@@ -803,7 +803,6 @@ int Mdrunner::mdrunner()
     real                        ewaldcoeff_q     = 0;
     real                        ewaldcoeff_lj    = 0;
     int                         nChargePerturbed = -1, nTypePerturbed = 0;
-    gmx_wallcycle_t             wcycle;
     gmx_walltime_accounting_t   walltime_accounting = nullptr;
     MembedHolder                membedHolder(filenames.size(), filenames.data());
 
@@ -1598,15 +1597,16 @@ int Mdrunner::mdrunner()
                         "The -resetstep functionality is deprecated, and may be removed in a "
                         "future version.");
     }
-    wcycle = wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
+    std::unique_ptr<gmx_wallcycle> wcycle =
+            wallcycle_init(fplog, mdrunOptions.timingOptions.resetStep, cr);
 
     if (PAR(cr))
     {
         /* Master synchronizes its value of reset_counters with all nodes
          * including PME only nodes */
-        int64_t reset_counters = wcycle_get_reset_counters(wcycle);
+        int64_t reset_counters = wcycle_get_reset_counters(wcycle.get());
         gmx_bcast(sizeof(reset_counters), &reset_counters, cr->mpi_comm_mysim);
-        wcycle_set_reset_counters(wcycle, reset_counters);
+        wcycle_set_reset_counters(wcycle.get(), reset_counters);
     }
 
     // Membrane embedding must be initialized before we call init_forcerec()
@@ -1680,7 +1680,7 @@ int Mdrunner::mdrunner()
                                         deviceStreamManager.get(),
                                         mtop,
                                         box,
-                                        wcycle);
+                                        wcycle.get());
         // TODO: Move the logic below to a GPU bonded builder
         if (runScheduleWork.simulationWork.useGpuBonded)
         {
@@ -1692,7 +1692,7 @@ int Mdrunner::mdrunner()
                     fr->ic->epsfac * fr->fudgeQQ,
                     deviceStreamManager->context(),
                     deviceStreamManager->bondedStream(havePPDomainDecomposition(cr)),
-                    wcycle);
+                    wcycle.get());
             fr->gpuBonded = gpuBonded.get();
         }
 
@@ -1910,7 +1910,7 @@ int Mdrunner::mdrunner()
 
         /* Let makeConstraints know whether we have essential dynamics constraints. */
         auto constr = makeConstraints(
-                mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr, ms, &nrnb, wcycle, fr->bMolPBC);
+                mtop, *inputrec, pull_work, doEssentialDynamics, fplog, cr, ms, &nrnb, wcycle.get(), fr->bMolPBC);
 
         /* Energy terms and groups */
         gmx_enerdata_t enerd(mtop.groups.groups[SimulationAtomGroupType::EnergyOutput].size(),
@@ -1927,7 +1927,7 @@ int Mdrunner::mdrunner()
         /* Set up interactive MD (IMD) */
         auto imdSession = makeImdSession(inputrec.get(),
                                          cr,
-                                         wcycle,
+                                         wcycle.get(),
                                          &enerd,
                                          ms,
                                          mtop,
@@ -1960,11 +1960,11 @@ int Mdrunner::mdrunner()
             fr->gpuForceReduction[gmx::AtomLocality::Local] = std::make_unique<gmx::GpuForceReduction>(
                     deviceStreamManager->context(),
                     deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedLocal),
-                    wcycle);
+                    wcycle.get());
             fr->gpuForceReduction[gmx::AtomLocality::NonLocal] = std::make_unique<gmx::GpuForceReduction>(
                     deviceStreamManager->context(),
                     deviceStreamManager->stream(gmx::DeviceStreamType::NonBondedNonLocal),
-                    wcycle);
+                    wcycle.get());
         }
 
         std::unique_ptr<gmx::StatePropagatorDataGpu> stateGpu;
@@ -1979,7 +1979,7 @@ int Mdrunner::mdrunner()
             GMX_RELEASE_ASSERT(deviceStreamManager != nullptr,
                                "GPU device stream manager should be initialized to use GPU.");
             stateGpu = std::make_unique<gmx::StatePropagatorDataGpu>(
-                    *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle);
+                    *deviceStreamManager, transferKind, pme_gpu_get_block_size(fr->pmedata), wcycle.get());
             fr->stateGpu = stateGpu.get();
         }
 
@@ -1993,7 +1993,7 @@ int Mdrunner::mdrunner()
 
 
         simulatorBuilder.add(SimulatorEnv(fplog, cr, ms, mdlog, oenv));
-        simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle));
+        simulatorBuilder.add(Profiling(&nrnb, walltime_accounting, wcycle.get()));
         simulatorBuilder.add(ConstraintsParam(
                 constr.get(), enforcedRotation ? enforcedRotation->getLegacyEnfrot() : nullptr, vsite.get()));
         // TODO: Separate `fr` to a separate add, and make the `build` handle the coupling sensibly.
@@ -2033,14 +2033,14 @@ int Mdrunner::mdrunner()
         gmx_pmeonly(pmedata,
                     cr,
                     &nrnb,
-                    wcycle,
+                    wcycle.get(),
                     walltime_accounting,
                     inputrec.get(),
                     pmeRunMode,
                     deviceStreamManager.get());
     }
 
-    wallcycle_stop(wcycle, ewcRUN);
+    wallcycle_stop(wcycle.get(), WallCycleCounter::Run);
 
     /* Finish up, write some stuff
      * if rerunMD, don't write last frame again
@@ -2050,14 +2050,12 @@ int Mdrunner::mdrunner()
                cr,
                *inputrec,
                &nrnb,
-               wcycle,
+               wcycle.get(),
                walltime_accounting,
                fr ? fr->nbv.get() : nullptr,
                pmedata,
                EI_DYNAMICS(inputrec->eI) && !isMultiSim(ms));
 
-    // clean up cycle counter
-    wallcycle_destroy(wcycle);
 
     deviceStreamManager.reset(nullptr);
     // Free PME data
diff --git a/src/gromacs/mdrun/shellfc.cpp b/src/gromacs/mdrun/shellfc.cpp
index 0b630b6a52..af47d79566 100644
--- a/src/gromacs/mdrun/shellfc.cpp
+++ b/src/gromacs/mdrun/shellfc.cpp
@@ -957,7 +957,7 @@ void relax_shell_flexcon(FILE*                         fplog,
                          tensor                        force_vir,
                          const t_mdatoms&              md,
                          t_nrnb*                       nrnb,
-                         gmx_wallcycle_t               wcycle,
+                         gmx_wallcycle*                wcycle,
                          gmx_shellfc_t*                shfc,
                          t_forcerec*                   fr,
                          gmx::MdrunScheduleWorkload*   runScheduleWork,
diff --git a/src/gromacs/mdrun/shellfc.h b/src/gromacs/mdrun/shellfc.h
index 6d3120fd98..b8a314679c 100644
--- a/src/gromacs/mdrun/shellfc.h
+++ b/src/gromacs/mdrun/shellfc.h
@@ -116,7 +116,7 @@ void relax_shell_flexcon(FILE*                               log,
                          tensor                              force_vir,
                          const t_mdatoms&                    md,
                          t_nrnb*                             nrnb,
-                         gmx_wallcycle_t                     wcycle,
+                         gmx_wallcycle*                      wcycle,
                          gmx_shellfc_t*                      shfc,
                          t_forcerec*                         fr,
                          gmx::MdrunScheduleWorkload*         runScheduleWork,
diff --git a/src/gromacs/mdrun/tpi.cpp b/src/gromacs/mdrun/tpi.cpp
index 056cbb29d4..758e2ae691 100644
--- a/src/gromacs/mdrun/tpi.cpp
+++ b/src/gromacs/mdrun/tpi.cpp
@@ -305,7 +305,7 @@ void LegacySimulator::do_tpi()
 
     /* Print to log file  */
     walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
+    wallcycle_start(wcycle, WallCycleCounter::Run);
     print_start(fplog, cr, walltime_accounting, "Test Particle Insertion");
 
     /* The last charge group is the group to be inserted */
diff --git a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
index 16fbe131f8..69e11d69c0 100644
--- a/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
+++ b/src/gromacs/mdtypes/state_propagator_data_gpu_impl_gpu.cpp
@@ -136,8 +136,8 @@ StatePropagatorDataGpu::Impl::~Impl() {}
 
 void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
 {
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
 
     numAtomsLocal_ = numAtomsLocal;
     numAtomsAll_   = numAtomsAll;
@@ -174,8 +174,8 @@ void StatePropagatorDataGpu::Impl::reinit(int numAtomsLocal, int numAtomsAll)
         clearDeviceBufferAsync(&d_f_, 0, d_fCapacity_, *localStream_);
     }
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 std::tuple<int, int> StatePropagatorDataGpu::Impl::getAtomRangesFromAtomLocality(AtomLocality atomLocality)
@@ -316,8 +316,8 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying positions with given atom locality.");
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
 
     copyToDevice(d_x_, h_x, d_xSize_, atomLocality, *deviceStream);
 
@@ -330,8 +330,8 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesToGpu(const gmx::ArrayRef<cons
         xReadyOnDevice_[atomLocality].markEvent(*deviceStream);
     }
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 GpuEventSynchronizer*
@@ -363,10 +363,10 @@ StatePropagatorDataGpu::Impl::getCoordinatesReadyOnDeviceEvent(AtomLocality atom
 
 void StatePropagatorDataGpu::Impl::waitCoordinatesCopiedToDevice(AtomLocality atomLocality)
 {
-    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+    wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
     GMX_ASSERT(atomLocality < AtomLocality::Count, "Wrong atom locality.");
     xReadyOnDevice_[atomLocality].waitForEvent();
-    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+    wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
 }
 
 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::xUpdatedOnDevice()
@@ -381,22 +381,22 @@ void StatePropagatorDataGpu::Impl::copyCoordinatesFromGpu(gmx::ArrayRef<gmx::RVe
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying positions with given atom locality.");
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
 
     copyFromDevice(h_x, d_x_, d_xSize_, atomLocality, *deviceStream);
     // Note: unlike copyCoordinatesToGpu this is not used in OpenCL, and the conditional is not needed.
     xReadyOnHost_[atomLocality].markEvent(*deviceStream);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 void StatePropagatorDataGpu::Impl::waitCoordinatesReadyOnHost(AtomLocality atomLocality)
 {
-    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+    wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
     xReadyOnHost_[atomLocality].waitForEvent();
-    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+    wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
 }
 
 
@@ -413,14 +413,14 @@ void StatePropagatorDataGpu::Impl::copyVelocitiesToGpu(const gmx::ArrayRef<const
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying velocities with given atom locality.");
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
 
     copyToDevice(d_v_, h_v, d_vSize_, atomLocality, *deviceStream);
     vReadyOnDevice_[atomLocality].markEvent(*deviceStream);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getVelocitiesReadyOnDeviceEvent(AtomLocality atomLocality)
@@ -436,21 +436,21 @@ void StatePropagatorDataGpu::Impl::copyVelocitiesFromGpu(gmx::ArrayRef<gmx::RVec
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying velocities with given atom locality.");
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
 
     copyFromDevice(h_v, d_v_, d_vSize_, atomLocality, *deviceStream);
     vReadyOnHost_[atomLocality].markEvent(*deviceStream);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 void StatePropagatorDataGpu::Impl::waitVelocitiesReadyOnHost(AtomLocality atomLocality)
 {
-    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+    wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
     vReadyOnHost_[atomLocality].waitForEvent();
-    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+    wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
 }
 
 
@@ -467,14 +467,14 @@ void StatePropagatorDataGpu::Impl::copyForcesToGpu(const gmx::ArrayRef<const gmx
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying forces with given atom locality.");
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
 
     copyToDevice(d_f_, h_f, d_fSize_, atomLocality, *deviceStream);
     fReadyOnDevice_[atomLocality].markEvent(*deviceStream);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
@@ -484,13 +484,13 @@ void StatePropagatorDataGpu::Impl::clearForcesOnGpu(AtomLocality atomLocality)
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for clearing forces with given atom locality.");
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
 
     clearOnDevice(d_f_, d_fSize_, atomLocality, *deviceStream);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 GpuEventSynchronizer* StatePropagatorDataGpu::Impl::getForcesReadyOnDeviceEvent(AtomLocality atomLocality,
@@ -518,21 +518,21 @@ void StatePropagatorDataGpu::Impl::copyForcesFromGpu(gmx::ArrayRef<gmx::RVec> h_
     GMX_ASSERT(deviceStream != nullptr,
                "No stream is valid for copying forces with given atom locality.");
 
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
 
     copyFromDevice(h_f, d_f_, d_fSize_, atomLocality, *deviceStream);
     fReadyOnHost_[atomLocality].markEvent(*deviceStream);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_STATE_PROPAGATOR_DATA);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchStatePropagatorData);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 void StatePropagatorDataGpu::Impl::waitForcesReadyOnHost(AtomLocality atomLocality)
 {
-    wallcycle_start(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+    wallcycle_start(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
     fReadyOnHost_[atomLocality].waitForEvent();
-    wallcycle_stop(wcycle_, ewcWAIT_GPU_STATE_PROPAGATOR_DATA);
+    wallcycle_stop(wcycle_, WallCycleCounter::WaitGpuStatePropagatorData);
 }
 
 const DeviceStream* StatePropagatorDataGpu::Impl::getUpdateStream()
diff --git a/src/gromacs/modularsimulator/propagator.cpp b/src/gromacs/modularsimulator/propagator.cpp
index 7a6074d0cf..09edcd0890 100644
--- a/src/gromacs/modularsimulator/propagator.cpp
+++ b/src/gromacs/modularsimulator/propagator.cpp
@@ -154,7 +154,7 @@ template<NumVelocityScalingValues        numStartVelocityScalingValues,
          NumVelocityScalingValues        numEndVelocityScalingValues>
 void Propagator<IntegrationStep::PositionsOnly>::run()
 {
-    wallcycle_start(wcycle_, ewcUPDATE);
+    wallcycle_start(wcycle_, WallCycleCounter::Update);
 
     auto xp = as_rvec_array(statePropagatorData_->positionsView().paddedArrayRef().data());
     auto x  = as_rvec_array(statePropagatorData_->constPositionsView().paddedArrayRef().data());
@@ -178,7 +178,7 @@ void Propagator<IntegrationStep::PositionsOnly>::run()
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
     }
-    wallcycle_stop(wcycle_, ewcUPDATE);
+    wallcycle_stop(wcycle_, WallCycleCounter::Update);
 }
 
 //! Propagation (velocity only)
@@ -188,7 +188,7 @@ template<NumVelocityScalingValues        numStartVelocityScalingValues,
          NumVelocityScalingValues        numEndVelocityScalingValues>
 void Propagator<IntegrationStep::VelocitiesOnly>::run()
 {
-    wallcycle_start(wcycle_, ewcUPDATE);
+    wallcycle_start(wcycle_, WallCycleCounter::Update);
 
     auto v = as_rvec_array(statePropagatorData_->velocitiesView().paddedArrayRef().data());
     auto f = as_rvec_array(statePropagatorData_->constForcesView().force().data());
@@ -258,7 +258,7 @@ void Propagator<IntegrationStep::VelocitiesOnly>::run()
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
     }
-    wallcycle_stop(wcycle_, ewcUPDATE);
+    wallcycle_stop(wcycle_, WallCycleCounter::Update);
 }
 
 //! Propagation (leapfrog case - position and velocity)
@@ -268,7 +268,7 @@ template<NumVelocityScalingValues        numStartVelocityScalingValues,
          NumVelocityScalingValues        numEndVelocityScalingValues>
 void Propagator<IntegrationStep::LeapFrog>::run()
 {
-    wallcycle_start(wcycle_, ewcUPDATE);
+    wallcycle_start(wcycle_, WallCycleCounter::Update);
 
     auto xp = as_rvec_array(statePropagatorData_->positionsView().paddedArrayRef().data());
     auto x  = as_rvec_array(statePropagatorData_->constPositionsView().paddedArrayRef().data());
@@ -342,7 +342,7 @@ void Propagator<IntegrationStep::LeapFrog>::run()
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
     }
-    wallcycle_stop(wcycle_, ewcUPDATE);
+    wallcycle_stop(wcycle_, WallCycleCounter::Update);
 }
 
 //! Propagation (velocity verlet stage 2 - velocity and position)
@@ -352,7 +352,7 @@ template<NumVelocityScalingValues        numStartVelocityScalingValues,
          NumVelocityScalingValues        numEndVelocityScalingValues>
 void Propagator<IntegrationStep::VelocityVerletPositionsAndVelocities>::run()
 {
-    wallcycle_start(wcycle_, ewcUPDATE);
+    wallcycle_start(wcycle_, WallCycleCounter::Update);
 
     auto xp = as_rvec_array(statePropagatorData_->positionsView().paddedArrayRef().data());
     auto x  = as_rvec_array(statePropagatorData_->constPositionsView().paddedArrayRef().data());
@@ -426,7 +426,7 @@ void Propagator<IntegrationStep::VelocityVerletPositionsAndVelocities>::run()
         }
         GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR
     }
-    wallcycle_stop(wcycle_, ewcUPDATE);
+    wallcycle_stop(wcycle_, WallCycleCounter::Update);
 }
 
 template<IntegrationStep algorithm>
diff --git a/src/gromacs/modularsimulator/simulatoralgorithm.cpp b/src/gromacs/modularsimulator/simulatoralgorithm.cpp
index afce03938b..ff20ef7cab 100644
--- a/src/gromacs/modularsimulator/simulatoralgorithm.cpp
+++ b/src/gromacs/modularsimulator/simulatoralgorithm.cpp
@@ -228,7 +228,7 @@ void ModularSimulatorAlgorithm::simulatorSetup()
     }
 
     walltime_accounting_start_time(walltime_accounting);
-    wallcycle_start(wcycle, ewcRUN);
+    wallcycle_start(wcycle, WallCycleCounter::Run);
     print_start(fplog, cr, walltime_accounting, "mdrun");
 
     step_ = inputrec->init_step;
@@ -275,7 +275,7 @@ void ModularSimulatorAlgorithm::preStep(Step step, Time gmx_unused time, bool is
     stophandlerCurrentStep_ = step;
     stopHandler_->setSignal();
 
-    wallcycle_start(wcycle, ewcSTEP);
+    wallcycle_start(wcycle, WallCycleCounter::Step);
 }
 
 void ModularSimulatorAlgorithm::postStep(Step step, Time gmx_unused time)
@@ -301,7 +301,7 @@ void ModularSimulatorAlgorithm::postStep(Step step, Time gmx_unused time)
         print_time(stderr, walltime_accounting, step, inputrec, cr);
     }
 
-    double cycles = wallcycle_stop(wcycle, ewcSTEP);
+    double cycles = wallcycle_stop(wcycle, WallCycleCounter::Step);
     if (DOMAINDECOMP(cr) && wcycle)
     {
         dd_cycles_add(cr->dd, static_cast<float>(cycles), ddCyclStep);
diff --git a/src/gromacs/modularsimulator/statepropagatordata.cpp b/src/gromacs/modularsimulator/statepropagatordata.cpp
index 1cf9df615e..6c1d1a6ad9 100644
--- a/src/gromacs/modularsimulator/statepropagatordata.cpp
+++ b/src/gromacs/modularsimulator/statepropagatordata.cpp
@@ -393,7 +393,7 @@ StatePropagatorData::Element::registerTrajectoryWriterCallback(TrajectoryEvent e
 
 void StatePropagatorData::Element::write(gmx_mdoutf_t outf, Step currentStep, Time currentTime)
 {
-    wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ);
+    wallcycle_start(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
     unsigned int mdof_flags = 0;
     if (do_per_step(currentStep, nstxout_))
     {
@@ -430,7 +430,7 @@ void StatePropagatorData::Element::write(gmx_mdoutf_t outf, Step currentStep, Ti
 
     if (mdof_flags == 0)
     {
-        wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ);
+        wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
         return;
     }
     GMX_ASSERT(localStateBackup_, "Trajectory writing called, but no state saved.");
@@ -455,7 +455,7 @@ void StatePropagatorData::Element::write(gmx_mdoutf_t outf, Step currentStep, Ti
     {
         localStateBackup_.reset();
     }
-    wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ);
+    wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
 }
 
 void StatePropagatorData::Element::elementSetup()
@@ -619,7 +619,7 @@ void StatePropagatorData::Element::trajectoryWriterTeardown(gmx_mdoutf* gmx_unus
 
     GMX_ASSERT(localStateBackup_, "Final trajectory writing called, but no state saved.");
 
-    wallcycle_start(mdoutf_get_wcycle(outf), ewcTRAJ);
+    wallcycle_start(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
     if (DOMAINDECOMP(cr_))
     {
         auto globalXRef =
@@ -664,7 +664,7 @@ void StatePropagatorData::Element::trajectoryWriterTeardown(gmx_mdoutf* gmx_unus
                             pbcType_,
                             localStateBackup_->box);
     }
-    wallcycle_stop(mdoutf_get_wcycle(outf), ewcTRAJ);
+    wallcycle_stop(mdoutf_get_wcycle(outf), WallCycleCounter::Traj);
 }
 
 std::optional<SignallerCallback> StatePropagatorData::Element::registerLastStepCallback()
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index cdc35d093a..bf9c25a5e7 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -286,18 +286,18 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
             // we start without counting and only when the task finished we issue a
             // start/stop to increment.
             // GpuTaskCompletion::Wait mode the timing is expected to be done in the caller.
-            wallcycle_start_nocount(wcycle, ewcWAIT_GPU_NB_L);
+            wallcycle_start_nocount(wcycle, WallCycleCounter::WaitGpuNbL);
 
             if (!haveStreamTasksCompleted(*nb->deviceStreams[iLocality]))
             {
-                wallcycle_stop(wcycle, ewcWAIT_GPU_NB_L);
+                wallcycle_stop(wcycle, WallCycleCounter::WaitGpuNbL);
 
                 // Early return to skip the steps below that we have to do only
                 // after the NB task completed
                 return false;
             }
 
-            wallcycle_increment_event_count(wcycle, ewcWAIT_GPU_NB_L);
+            wallcycle_increment_event_count(wcycle, WallCycleCounter::WaitGpuNbL);
         }
         else if (haveResultToWaitFor)
         {
@@ -361,8 +361,8 @@ float gpu_wait_finish_task(NbnxmGpu*                nb,
                            gmx_wallcycle*           wcycle)
 {
     auto cycleCounter = (atomToInteractionLocality(aloc) == InteractionLocality::Local)
-                                ? ewcWAIT_GPU_NB_L
-                                : ewcWAIT_GPU_NB_NL;
+                                ? WallCycleCounter::WaitGpuNbL
+                                : WallCycleCounter::WaitGpuNbNL;
 
     wallcycle_start(wcycle, cycleCounter);
     gpu_try_finish_task(nb, stepWork, aloc, e_lj, e_el, shiftForces, GpuTaskCompletion::Wait, wcycle);
diff --git a/src/gromacs/nbnxm/kerneldispatch.cpp b/src/gromacs/nbnxm/kerneldispatch.cpp
index 846e2bbdb3..93e28f8487 100644
--- a/src/gromacs/nbnxm/kerneldispatch.cpp
+++ b/src/gromacs/nbnxm/kerneldispatch.cpp
@@ -259,7 +259,7 @@ static void nbnxn_kernel_cpu(const PairlistSet&             pairlistSet,
     auto shiftVecPointer = as_rvec_array(shiftVectors.data());
 
     int gmx_unused nthreads = gmx_omp_nthreads_get(emntNonbonded);
-    wallcycle_sub_start(wcycle, ewcsNONBONDED_CLEAR);
+    wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedClear);
 #pragma omp parallel for schedule(static) num_threads(nthreads)
     for (gmx::index nb = 0; nb < pairlists.ssize(); nb++)
     {
@@ -276,8 +276,8 @@ static void nbnxn_kernel_cpu(const PairlistSet&             pairlistSet,
 
         if (nb == 0)
         {
-            wallcycle_sub_stop(wcycle, ewcsNONBONDED_CLEAR);
-            wallcycle_sub_start(wcycle, ewcsNONBONDED_KERNEL);
+            wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedClear);
+            wallcycle_sub_start(wcycle, WallCycleSubCounter::NonbondedKernel);
         }
 
         // TODO: Change to reference
@@ -375,7 +375,7 @@ static void nbnxn_kernel_cpu(const PairlistSet&             pairlistSet,
             }
         }
     }
-    wallcycle_sub_stop(wcycle, ewcsNONBONDED_KERNEL);
+    wallcycle_sub_stop(wcycle, WallCycleSubCounter::NonbondedKernel);
 
     if (stepWork.computeEnergy)
     {
@@ -547,7 +547,7 @@ void nonbonded_verlet_t::dispatchFreeEnergyKernel(gmx::InteractionLocality
     GMX_ASSERT(gmx_omp_nthreads_get(emntNonbonded) == nbl_fep.ssize(),
                "Number of lists should be same as number of NB threads");
 
-    wallcycle_sub_start(wcycle_, ewcsNONBONDED_FEP);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::NonbondedFep);
 #pragma omp parallel for schedule(static) num_threads(nbl_fep.ssize())
     for (gmx::index th = 0; th < nbl_fep.ssize(); th++)
     {
@@ -640,5 +640,5 @@ void nonbonded_verlet_t::dispatchFreeEnergyKernel(gmx::InteractionLocality
                             + dvdl_nb[FreeEnergyPerturbationCouplingType::Coul]);
         }
     }
-    wallcycle_sub_stop(wcycle_, ewcsNONBONDED_FEP);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NonbondedFep);
 }
diff --git a/src/gromacs/nbnxm/nbnxm.cpp b/src/gromacs/nbnxm/nbnxm.cpp
index 62febc366f..ecc0ecc2ca 100644
--- a/src/gromacs/nbnxm/nbnxm.cpp
+++ b/src/gromacs/nbnxm/nbnxm.cpp
@@ -147,27 +147,27 @@ void nonbonded_verlet_t::setAtomProperties(gmx::ArrayRef<const int>  atomTypes,
 void nonbonded_verlet_t::convertCoordinates(const gmx::AtomLocality        locality,
                                             gmx::ArrayRef<const gmx::RVec> coordinates)
 {
-    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle_, ewcsNB_X_BUF_OPS);
+    wallcycle_start(wcycle_, WallCycleCounter::NbXFBufOps);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::NBXBufOps);
 
     nbnxn_atomdata_copy_x_to_nbat_x(
             pairSearch_->gridSet(), locality, as_rvec_array(coordinates.data()), nbat.get());
 
-    wallcycle_sub_stop(wcycle_, ewcsNB_X_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NBXBufOps);
+    wallcycle_stop(wcycle_, WallCycleCounter::NbXFBufOps);
 }
 
 void nonbonded_verlet_t::convertCoordinatesGpu(const gmx::AtomLocality locality,
                                                DeviceBuffer<gmx::RVec> d_x,
                                                GpuEventSynchronizer*   xReadyOnDevice)
 {
-    wallcycle_start(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
+    wallcycle_start(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuNBXBufOps);
 
     nbnxn_atomdata_x_to_nbat_x_gpu(pairSearch_->gridSet(), locality, gpu_nbv, d_x, xReadyOnDevice);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NB_X_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNBXBufOps);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
 
 gmx::ArrayRef<const int> nonbonded_verlet_t::getGridIndices() const
@@ -186,13 +186,13 @@ void nonbonded_verlet_t::atomdata_add_nbat_f_to_f(const gmx::AtomLocality  local
         return;
     }
 
-    wallcycle_start(wcycle_, ewcNB_XF_BUF_OPS);
-    wallcycle_sub_start(wcycle_, ewcsNB_F_BUF_OPS);
+    wallcycle_start(wcycle_, WallCycleCounter::NbXFBufOps);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::NBFBufOps);
 
     reduceForces(nbat.get(), locality, pairSearch_->gridSet(), as_rvec_array(force.data()));
 
-    wallcycle_sub_stop(wcycle_, ewcsNB_F_BUF_OPS);
-    wallcycle_stop(wcycle_, ewcNB_XF_BUF_OPS);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::NBFBufOps);
+    wallcycle_stop(wcycle_, WallCycleCounter::NbXFBufOps);
 }
 
 int nonbonded_verlet_t::getNumAtoms(const gmx::AtomLocality locality) const
diff --git a/src/gromacs/nbnxm/prunekerneldispatch.cpp b/src/gromacs/nbnxm/prunekerneldispatch.cpp
index f3e4dee503..98d05e13b1 100644
--- a/src/gromacs/nbnxm/prunekerneldispatch.cpp
+++ b/src/gromacs/nbnxm/prunekerneldispatch.cpp
@@ -100,8 +100,8 @@ void nonbonded_verlet_t::dispatchPruneKernelCpu(const gmx::InteractionLocality i
 
 void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step)
 {
-    wallcycle_start_nocount(wcycle_, ewcLAUNCH_GPU);
-    wallcycle_sub_start_nocount(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start_nocount(wcycle_, WallCycleSubCounter::LaunchGpuNonBonded);
 
     const bool stepIsEven =
             (pairlistSets().numStepsWithPairlist(step) % (2 * pairlistSets().params().mtsFactor) == 0);
@@ -111,6 +111,6 @@ void nonbonded_verlet_t::dispatchPruneKernelGpu(int64_t step)
             stepIsEven ? gmx::InteractionLocality::Local : gmx::InteractionLocality::NonLocal,
             pairlistSets().params().numRollingPruningParts);
 
-    wallcycle_sub_stop(wcycle_, ewcsLAUNCH_GPU_NONBONDED);
-    wallcycle_stop(wcycle_, ewcLAUNCH_GPU);
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuNonBonded);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
 }
diff --git a/src/gromacs/swap/swapcoords.cpp b/src/gromacs/swap/swapcoords.cpp
index 11c152313c..9414963873 100644
--- a/src/gromacs/swap/swapcoords.cpp
+++ b/src/gromacs/swap/swapcoords.cpp
@@ -2053,7 +2053,7 @@ gmx_bool do_swapcoords(t_commrec*        cr,
     rvec                com_solvent, com_particle; /* solvent and swap molecule's center of mass */
 
 
-    wallcycle_start(wcycle, ewcSWAP);
+    wallcycle_start(wcycle, WallCycleCounter::Swap);
 
     set_pbc(s->pbc, ir->pbcType, box);
 
@@ -2242,7 +2242,7 @@ gmx_bool do_swapcoords(t_commrec*        cr,
 
     } /* end of if(bSwap) */
 
-    wallcycle_stop(wcycle, ewcSWAP);
+    wallcycle_stop(wcycle, WallCycleCounter::Swap);
 
     return bSwap;
 }
diff --git a/src/gromacs/timing/tests/timing.cpp b/src/gromacs/timing/tests/timing.cpp
index 158d6aba69..3155ad4e14 100644
--- a/src/gromacs/timing/tests/timing.cpp
+++ b/src/gromacs/timing/tests/timing.cpp
@@ -44,6 +44,7 @@
 #include "config.h"
 
 #include <chrono>
+#include <memory>
 #include <thread>
 
 #include "gromacs/timing/cyclecounter.h"
@@ -72,31 +73,30 @@ class TimingTest : public ::testing::Test
 {
 public:
     TimingTest() : wcycle(wallcycle_init(nullptr, 0, nullptr)) {}
-    ~TimingTest() override { wallcycle_destroy(wcycle); }
 
 protected:
-    const int       delayInMilliseconds = 1;
-    gmx_wallcycle_t wcycle;
+    const int                      delayInMilliseconds = 1;
+    std::unique_ptr<gmx_wallcycle> wcycle;
 };
 
 
 //! Test whether the we can run the cycle counter.
 TEST_F(TimingTest, RunWallCycle)
 {
-    int    probe = 0, ref = 1;
-    int    n1, n2;
-    double c1, c2;
+    WallCycleCounter probe = WallCycleCounter::Run, ref = WallCycleCounter::Step;
+    int              n1, n2;
+    double           c1, c2;
 
     //! credit cycles from enclosing call to the ref field of wcycle
-    wallcycle_start(wcycle, ref);
+    wallcycle_start(wcycle.get(), ref);
     //! cycles from the probe call
-    wallcycle_start(wcycle, probe);
+    wallcycle_start(wcycle.get(), probe);
     sleepForMilliseconds(delayInMilliseconds);
-    wallcycle_stop(wcycle, probe);
-    wallcycle_stop(wcycle, ref);
+    wallcycle_stop(wcycle.get(), probe);
+    wallcycle_stop(wcycle.get(), ref);
     //! extract both
-    wallcycle_get(wcycle, probe, &n1, &c1);
-    wallcycle_get(wcycle, ref, &n2, &c2);
+    wallcycle_get(wcycle.get(), probe, &n1, &c1);
+    wallcycle_get(wcycle.get(), ref, &n2, &c2);
 
     EXPECT_EQ(n1, n2);
     EXPECT_DOUBLE_EQ_TOL(c1, c2, relativeToleranceAsFloatingPoint(c1, 5e-3));
@@ -108,17 +108,17 @@ TEST_F(TimingTest, RunWallCycleSub)
 {
     if (useCycleSubcounters)
     {
-        int    probe = 0;
-        int    ref   = 1;
-        int    n1, n2;
-        double c1, c2;
-        wallcycle_sub_start(wcycle, ref);
-        wallcycle_sub_start(wcycle, probe);
+        WallCycleSubCounter probe = WallCycleSubCounter::DDRedist;
+        WallCycleSubCounter ref   = WallCycleSubCounter::DDGrid;
+        int                 n1, n2;
+        double              c1, c2;
+        wallcycle_sub_start(wcycle.get(), ref);
+        wallcycle_sub_start(wcycle.get(), probe);
         sleepForMilliseconds(delayInMilliseconds);
-        wallcycle_sub_stop(wcycle, probe);
-        wallcycle_sub_stop(wcycle, ref);
-        wallcycle_sub_get(wcycle, probe, &n1, &c1);
-        wallcycle_sub_get(wcycle, ref, &n2, &c2);
+        wallcycle_sub_stop(wcycle.get(), probe);
+        wallcycle_sub_stop(wcycle.get(), ref);
+        wallcycle_sub_get(wcycle.get(), probe, &n1, &c1);
+        wallcycle_sub_get(wcycle.get(), ref, &n2, &c2);
 
         EXPECT_EQ(n1, n2);
         EXPECT_DOUBLE_EQ_TOL(c1, c2, relativeToleranceAsFloatingPoint(c1, 5e-3));
diff --git a/src/gromacs/timing/wallcycle.cpp b/src/gromacs/timing/wallcycle.cpp
index 4ea520268c..6f49958713 100644
--- a/src/gromacs/timing/wallcycle.cpp
+++ b/src/gromacs/timing/wallcycle.cpp
@@ -44,6 +44,7 @@
 #include <cstdlib>
 
 #include <array>
+#include <memory>
 #include <vector>
 
 #include "gromacs/math/functions.h"
@@ -51,12 +52,15 @@
 #include "gromacs/timing/cyclecounter.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/timing/wallcyclereporting.h"
+#include "gromacs/utility/arrayref.h"
 #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/enumerationhelpers.h"
 #include "gromacs/utility/gmxassert.h"
 #include "gromacs/utility/gmxmpi.h"
 #include "gromacs/utility/logger.h"
 #include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/snprintf.h"
+#include "gromacs/utility/stringutil.h"
 
 //! Whether wallcycle debugging is enabled
 constexpr bool gmx_unused enableWallcycleDebug = (DEBUG_WCYCLE != 0);
@@ -71,87 +75,97 @@ constexpr bool gmx_unused debugPrintDepth = false /* enableWallcycleDebug */;
 
 /* Each name should not exceed 19 printing characters
    (ie. terminating null can be twentieth) */
-static const char* wcn[ewcNR] = { "Run",
-                                  "Step",
-                                  "PP during PME",
-                                  "Domain decomp.",
-                                  "DD comm. load",
-                                  "DD comm. bounds",
-                                  "Vsite constr.",
-                                  "Send X to PME",
-                                  "Neighbor search",
-                                  "Launch GPU ops.",
-                                  "Comm. coord.",
-                                  "Force",
-                                  "Wait + Comm. F",
-                                  "PME mesh",
-                                  "PME redist. X/F",
-                                  "PME spread",
-                                  "PME gather",
-                                  "PME 3D-FFT",
-                                  "PME 3D-FFT Comm.",
-                                  "PME solve LJ",
-                                  "PME solve Elec",
-                                  "PME wait for PP",
-                                  "Wait + Recv. PME F",
-                                  "Wait PME GPU spread",
-                                  "PME 3D-FFT",
-                                  "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
-                                  "Wait PME GPU gather",
-                                  "Wait Bonded GPU",
-                                  "Reduce GPU PME F",
-                                  "Wait GPU NB nonloc.",
-                                  "Wait GPU NB local",
-                                  "Wait GPU state copy",
-                                  "NB X/F buffer ops.",
-                                  "Vsite spread",
-                                  "COM pull force",
-                                  "AWH",
-                                  "Write traj.",
-                                  "Update",
-                                  "Constraints",
-                                  "Comm. energies",
-                                  "Enforced rotation",
-                                  "Add rot. forces",
-                                  "Position swapping",
-                                  "IMD",
-                                  "Test" };
-
-static const char* wcsn[ewcsNR] = {
-    "DD redist.",
-    "DD NS grid + sort",
-    "DD setup comm.",
-    "DD make top.",
-    "DD make constr.",
-    "DD top. other",
-    "DD GPU ops.",
-    "NS grid local",
-    "NS grid non-loc.",
-    "NS search local",
-    "NS search non-loc.",
-    "Bonded F",
-    "Bonded-FEP F",
-    "Restraints F",
-    "Listed buffer ops.",
-    "Nonbonded pruning",
-    "Nonbonded F kernel",
-    "Nonbonded F clear",
-    "Nonbonded FEP",
-    "Launch NB GPU tasks",
-    "Launch Bonded GPU tasks",
-    "Launch PME GPU tasks",
-    "Launch state copy",
-    "Ewald F correction",
-    "NB X buffer ops.",
-    "NB F buffer ops.",
-    "Clear force buffer",
-    "Launch GPU NB X buffer ops.",
-    "Launch GPU NB F buffer ops.",
-    "Launch GPU Comm. coord.",
-    "Launch GPU Comm. force.",
-    "Launch GPU update",
-    "Test subcounter",
-};
+static const char* enumValuetoString(WallCycleCounter enumValue)
+{
+    constexpr gmx::EnumerationArray<WallCycleCounter, const char*> wallCycleCounterNames = {
+        "Run",
+        "Step",
+        "PP during PME",
+        "Domain decomp.",
+        "DD comm. load",
+        "DD comm. bounds",
+        "Vsite constr.",
+        "Send X to PME",
+        "Neighbor search",
+        "Launch GPU ops.",
+        "Comm. coord.",
+        "Force",
+        "Wait + Comm. F",
+        "PME mesh",
+        "PME redist. X/F",
+        "PME spread",
+        "PME gather",
+        "PME 3D-FFT",
+        "PME 3D-FFT Comm.",
+        "PME solve LJ",
+        "PME solve Elec",
+        "PME wait for PP",
+        "Wait + Recv. PME F",
+        "Wait PME GPU spread",
+        "PME 3D-FFT",
+        "PME solve", /* the strings for FFT/solve are repeated here for mixed mode counters */
+        "Wait PME GPU gather",
+        "Wait Bonded GPU",
+        "Reduce GPU PME F",
+        "Wait GPU NB nonloc.",
+        "Wait GPU NB local",
+        "Wait GPU state copy",
+        "NB X/F buffer ops.",
+        "Vsite spread",
+        "COM pull force",
+        "AWH",
+        "Write traj.",
+        "Update",
+        "Constraints",
+        "Comm. energies",
+        "Enforced rotation",
+        "Add rot. forces",
+        "Position swapping",
+        "IMD",
+        "Test"
+    };
+    return wallCycleCounterNames[enumValue];
+}
+
+static const char* enumValuetoString(WallCycleSubCounter enumValue)
+{
+    constexpr gmx::EnumerationArray<WallCycleSubCounter, const char*> wallCycleSubCounterNames = {
+        "DD redist.",
+        "DD NS grid + sort",
+        "DD setup comm.",
+        "DD make top.",
+        "DD make constr.",
+        "DD top. other",
+        "DD GPU ops.",
+        "NS grid local",
+        "NS grid non-loc.",
+        "NS search local",
+        "NS search non-loc.",
+        "Bonded F",
+        "Bonded-FEP F",
+        "Restraints F",
+        "Listed buffer ops.",
+        "Nonbonded pruning",
+        "Nonbonded F kernel",
+        "Nonbonded F clear",
+        "Nonbonded FEP",
+        "Launch NB GPU tasks",
+        "Launch Bonded GPU tasks",
+        "Launch PME GPU tasks",
+        "Launch state copy",
+        "Ewald F correction",
+        "NB X buffer ops.",
+        "NB F buffer ops.",
+        "Clear force buffer",
+        "Launch GPU NB X buffer ops.",
+        "Launch GPU NB F buffer ops.",
+        "Launch GPU Comm. coord.",
+        "Launch GPU Comm. force.",
+        "Launch GPU update",
+        "Test subcounter"
+    };
+    return wallCycleSubCounterNames[enumValue];
+}
 
 /* PME GPU timing events' names - correspond to the enum in the gpu_timing.h */
 static const char* enumValuetoString(PmeStage enumValue)
@@ -168,23 +182,22 @@ bool wallcycle_have_counter()
     return gmx_cycles_have_counter();
 }
 
-gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
+std::unique_ptr<gmx_wallcycle> wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
 {
-    gmx_wallcycle_t wc;
+    std::unique_ptr<gmx_wallcycle> wc;
 
 
     if (!wallcycle_have_counter())
     {
-        return nullptr;
+        return wc;
     }
 
-    snew(wc, 1);
+    wc = std::make_unique<gmx_wallcycle>();
 
-    wc->haveInvalidCount = FALSE;
-    wc->wc_barrier       = FALSE;
-    wc->wcc_all          = nullptr;
+    wc->haveInvalidCount = false;
+    wc->wc_barrier       = false;
     wc->wc_depth         = 0;
-    wc->ewc_prev         = -1;
+    wc->ewc_prev         = WallCycleCounter::Count;
     wc->reset_counters   = resetstep;
     wc->cr               = cr;
 
@@ -196,23 +209,17 @@ gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
         {
             fprintf(fplog, "\nWill call MPI_Barrier before each cycle start/stop call\n\n");
         }
-        wc->wc_barrier = TRUE;
+        wc->wc_barrier = true;
     }
 #endif
 
-    snew(wc->wcc, ewcNR);
     if (getenv("GMX_CYCLE_ALL") != nullptr)
     {
         if (fplog)
         {
             fprintf(fplog, "\nWill time all the code during the run\n\n");
         }
-        snew(wc->wcc_all, ewcNR * ewcNR);
-    }
-
-    if (sc_useCycleSubcounters)
-    {
-        snew(wc->wcsc, ewcsNR);
+        wc->wcc_all.resize(sc_numWallCycleCountersSquared);
     }
 
 #if DEBUG_WCYCLE
@@ -223,32 +230,10 @@ gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr)
     return wc;
 }
 
-void wallcycle_destroy(gmx_wallcycle_t wc)
-{
-    if (wc == nullptr)
-    {
-        return;
-    }
-
-    if (wc->wcc != nullptr)
-    {
-        sfree(wc->wcc);
-    }
-    if (wc->wcc_all != nullptr)
-    {
-        sfree(wc->wcc_all);
-    }
-    if (wc->wcsc != nullptr)
-    {
-        sfree(wc->wcsc);
-    }
-    sfree(wc);
-}
-
 #if DEBUG_WCYCLE
-static void debug_start_check(gmx_wallcycle_t wc, int ewc)
+static void debug_start_check(gmx_wallcycle* wc, WallCycleCounter ewc)
 {
-    if (wc->count_depth < 0 || wc->count_depth >= DEPTH_MAX)
+    if (wc->count_depth < 0 || wc->count_depth >= c_MaxWallCycleDepth)
     {
         gmx_fatal(FARGS, "wallcycle counter depth out of range: %d", wc->count_depth + 1);
     }
@@ -258,41 +243,44 @@ static void debug_start_check(gmx_wallcycle_t wc, int ewc)
     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
     {
         std::string indentStr(4 * wc->count_depth, ' ');
-        fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
+        fprintf(stderr, "%swcycle_start depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
     }
 }
 
-static void debug_stop_check(gmx_wallcycle_t wc, int ewc)
+static void debug_stop_check(gmx_wallcycle* wc, WallCycleCounter ewc)
 {
     if (debugPrintDepth && (!onlyMasterDebugPrints || wc->isMasterRank))
     {
         std::string indentStr(4 * wc->count_depth, ' ');
-        fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, wcn[ewc]);
+        fprintf(stderr, "%swcycle_stop  depth %d, %s\n", indentStr.c_str(), wc->count_depth, enumValuetoString(ewc));
     }
 
     wc->count_depth--;
 
     if (wc->count_depth < 0)
     {
-        gmx_fatal(FARGS, "wallcycle counter depth out of range when stopping %s: %d", wcn[ewc], wc->count_depth);
+        gmx_fatal(FARGS,
+                  "wallcycle counter depth out of range when stopping %s: %d",
+                  enumValuetoString(ewc),
+                  wc->count_depth);
     }
     if (wc->counterlist[wc->count_depth] != ewc)
     {
         gmx_fatal(FARGS,
                   "wallcycle mismatch at stop, start %s, stop %s",
-                  wcn[wc->counterlist[wc->count_depth]],
-                  wcn[ewc]);
+                  enumValuetoString(wc->counterlist[wc->count_depth]),
+                  enumValuetoString(ewc));
     }
 }
 #endif
 
-void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c)
+void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c)
 {
     *n = wc->wcc[ewc].n;
     *c = static_cast<double>(wc->wcc[ewc].c);
 }
 
-void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c)
+void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c)
 {
     if (sc_useCycleSubcounters && wc != nullptr)
     {
@@ -301,48 +289,43 @@ void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c)
     }
 }
 
-void wallcycle_reset_all(gmx_wallcycle_t wc)
+void wallcycle_reset_all(gmx_wallcycle* wc)
 {
-    int i;
-
     if (wc == nullptr)
     {
         return;
     }
 
-    for (i = 0; i < ewcNR; i++)
+    for (auto& counter : wc->wcc)
     {
-        wc->wcc[i].n = 0;
-        wc->wcc[i].c = 0;
+        counter.n = 0;
+        counter.c = 0;
     }
-    wc->haveInvalidCount = FALSE;
+    wc->haveInvalidCount = false;
 
-    if (wc->wcc_all)
+    if (!wc->wcc_all.empty())
     {
-        for (i = 0; i < ewcNR * ewcNR; i++)
+        for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
         {
             wc->wcc_all[i].n = 0;
             wc->wcc_all[i].c = 0;
         }
     }
-    if (wc->wcsc)
+    for (auto& counter : wc->wcsc)
     {
-        for (i = 0; i < ewcsNR; i++)
-        {
-            wc->wcsc[i].n = 0;
-            wc->wcsc[i].c = 0;
-        }
+        counter.n = 0;
+        counter.c = 0;
     }
 }
 
-static gmx_bool is_pme_counter(int ewc)
+static bool is_pme_counter(WallCycleCounter ewc)
 {
-    return (ewc >= ewcPMEMESH && ewc <= ewcPMEWAITCOMM);
+    return (ewc >= WallCycleCounter::PmeMesh && ewc <= WallCycleCounter::PmeWaitComm);
 }
 
-static gmx_bool is_pme_subcounter(int ewc)
+static bool is_pme_subcounter(WallCycleCounter ewc)
 {
-    return (ewc >= ewcPME_REDISTXF && ewc < ewcPMEWAITCOMM);
+    return (ewc >= WallCycleCounter::PmeRedistXF && ewc < WallCycleCounter::PmeWaitComm);
 }
 
 void wallcycleBarrier(gmx_wallcycle* wc)
@@ -358,7 +341,10 @@ void wallcycleBarrier(gmx_wallcycle* wc)
 }
 
 /* Subtract counter ewc_sub timed inside a timing block for ewc_main */
-static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
+// NOLINTNEXTLINE(google-runtime-references)
+static void subtract_cycles(gmx::EnumerationArray<WallCycleCounter, wallcc_t>& wcc,
+                            WallCycleCounter                                   ewc_main,
+                            WallCycleCounter                                   ewc_sub)
 {
     if (wcc[ewc_sub].n > 0)
     {
@@ -374,45 +360,47 @@ static void subtract_cycles(wallcc_t* wcc, int ewc_main, int ewc_sub)
     }
 }
 
-void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
+void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme)
 {
     if (wc == nullptr)
     {
         return;
     }
 
-    for (int i = 0; i < ewcNR; i++)
+    for (auto key : keysOf(wc->wcc))
     {
-        if (is_pme_counter(i) || (i == ewcRUN && isPmeRank))
+        if (is_pme_counter(key) || (key == WallCycleCounter::Run && isPmeRank))
         {
-            wc->wcc[i].c *= nthreads_pme;
+            wc->wcc[key].c *= nthreads_pme;
 
-            if (wc->wcc_all)
+            if (!wc->wcc_all.empty())
             {
-                for (int j = 0; j < ewcNR; j++)
+                const int current = static_cast<int>(key);
+                for (int j = 0; j < sc_numWallCycleCounters; j++)
                 {
-                    wc->wcc_all[i * ewcNR + j].c *= nthreads_pme;
+                    wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pme;
                 }
             }
         }
         else
         {
-            wc->wcc[i].c *= nthreads_pp;
+            wc->wcc[key].c *= nthreads_pp;
 
-            if (wc->wcc_all)
+            if (!wc->wcc_all.empty())
             {
-                for (int j = 0; j < ewcNR; j++)
+                const int current = static_cast<int>(key);
+                for (int j = 0; j < sc_numWallCycleCounters; j++)
                 {
-                    wc->wcc_all[i * ewcNR + j].c *= nthreads_pp;
+                    wc->wcc_all[current * sc_numWallCycleCounters + j].c *= nthreads_pp;
                 }
             }
         }
     }
-    if (sc_useCycleSubcounters && wc->wcsc && !isPmeRank)
+    if (sc_useCycleSubcounters && !isPmeRank)
     {
-        for (int i = 0; i < ewcsNR; i++)
+        for (auto counter : wc->wcsc)
         {
-            wc->wcsc[i].c *= nthreads_pp;
+            counter.c *= nthreads_pp;
         }
     }
 }
@@ -429,16 +417,15 @@ void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthr
  * wcc_all are unused by the GPU reporting, but it is not satisfactory
  * for the future. Also, there's no need for MPI_Allreduce, since
  * only MASTERRANK uses any of the results. */
-WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
+WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc)
 {
-    WallcycleCounts cycles_sum;
-    wallcc_t*       wcc;
-    double          cycles[int(ewcNR) + int(ewcsNR)];
+    WallcycleCounts                                    cycles_sum;
+    gmx::EnumerationArray<WallCycleCounter, double>    cyclesMain;
+    gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSub;
 #if GMX_MPI
-    double cycles_n[int(ewcNR) + int(ewcsNR) + 1];
+    gmx::EnumerationArray<WallCycleCounter, double>    cyclesMainOnNode;
+    gmx::EnumerationArray<WallCycleSubCounter, double> cyclesSubOnNode;
 #endif
-    int i;
-    int nsum;
 
     if (wc == nullptr)
     {
@@ -448,104 +435,128 @@ WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc)
         return cycles_sum;
     }
 
-    wcc = wc->wcc;
+    auto& wcc = wc->wcc;
 
-    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMLOAD);
-    subtract_cycles(wcc, ewcDOMDEC, ewcDDCOMMBOUND);
+    subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommLoad);
+    subtract_cycles(wcc, WallCycleCounter::Domdec, WallCycleCounter::DDCommBound);
 
-    subtract_cycles(wcc, ewcPME_FFT, ewcPME_FFTCOMM);
+    subtract_cycles(wcc, WallCycleCounter::PmeFft, WallCycleCounter::PmeFftComm);
 
     if (cr->npmenodes == 0)
     {
         /* All nodes do PME (or no PME at all) */
-        subtract_cycles(wcc, ewcFORCE, ewcPMEMESH);
+        subtract_cycles(wcc, WallCycleCounter::Force, WallCycleCounter::PmeMesh);
     }
     else
     {
         /* The are PME-only nodes */
-        if (wcc[ewcPMEMESH].n > 0)
+        if (wcc[WallCycleCounter::PmeMesh].n > 0)
         {
             /* This must be a PME only node, calculate the Wait + Comm. time */
-            GMX_ASSERT(wcc[ewcRUN].c >= wcc[ewcPMEMESH].c,
+            GMX_ASSERT(wcc[WallCycleCounter::Run].c >= wcc[WallCycleCounter::PmeMesh].c,
                        "Total run ticks must be greater than PME-only ticks");
-            wcc[ewcPMEWAITCOMM].c = wcc[ewcRUN].c - wcc[ewcPMEMESH].c;
+            wcc[WallCycleCounter::PmeWaitComm].c =
+                    wcc[WallCycleCounter::Run].c - wcc[WallCycleCounter::PmeMesh].c;
         }
     }
 
     /* Store the cycles in a double buffer for summing */
-    for (i = 0; i < ewcNR; i++)
+    for (auto key : keysOf(wcc))
     {
 #if GMX_MPI
-        cycles_n[i] = static_cast<double>(wcc[i].n);
+        cyclesMainOnNode[key] = static_cast<double>(wcc[key].n);
 #endif
-        cycles[i] = static_cast<double>(wcc[i].c);
+        cyclesMain[key] = static_cast<double>(wcc[key].c);
     }
-    nsum = ewcNR;
-    if (wc->wcsc)
+    if (sc_useCycleSubcounters)
     {
-        for (i = 0; i < ewcsNR; i++)
+        for (auto key : keysOf(wc->wcsc))
         {
 #if GMX_MPI
-            cycles_n[ewcNR + i] = static_cast<double>(wc->wcsc[i].n);
+            cyclesSubOnNode[key] = static_cast<double>(wc->wcsc[key].n);
 #endif
-            cycles[ewcNR + i] = static_cast<double>(wc->wcsc[i].c);
+            cyclesSub[key] = static_cast<double>(wc->wcsc[key].c);
         }
-        nsum += ewcsNR;
     }
 
 #if GMX_MPI
     if (cr->nnodes > 1)
     {
-        double buf[int(ewcNR) + int(ewcsNR) + 1];
+        gmx::EnumerationArray<WallCycleCounter, double>    bufMain;
+        gmx::EnumerationArray<WallCycleSubCounter, double> bufSub;
 
         // TODO this code is used only at the end of the run, so we
         // can just do a simple reduce of haveInvalidCount in
         // wallcycle_print, and avoid bugs
-        cycles_n[nsum] = (wc->haveInvalidCount ? 1 : 0);
+        double haveInvalidCount = (wc->haveInvalidCount ? 1 : 0);
         // TODO Use MPI_Reduce
-        MPI_Allreduce(cycles_n, buf, nsum + 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
-        for (i = 0; i < ewcNR; i++)
+        MPI_Allreduce(cyclesMainOnNode.data(), bufMain.data(), bufMain.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+        if (sc_useCycleSubcounters)
+        {
+            MPI_Allreduce(cyclesSubOnNode.data(), bufSub.data(), bufSub.size(), MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+        }
+        MPI_Allreduce(MPI_IN_PLACE, &haveInvalidCount, 1, MPI_DOUBLE, MPI_MAX, cr->mpi_comm_mysim);
+        for (auto key : keysOf(wcc))
         {
-            wcc[i].n = gmx::roundToInt(buf[i]);
+            wcc[key].n = gmx::roundToInt(bufMain[key]);
         }
-        wc->haveInvalidCount = (buf[nsum] > 0);
-        if (wc->wcsc)
+        wc->haveInvalidCount = (haveInvalidCount > 0);
+        if (sc_useCycleSubcounters)
         {
-            for (i = 0; i < ewcsNR; i++)
+            for (auto key : keysOf(wc->wcsc))
             {
-                wc->wcsc[i].n = gmx::roundToInt(buf[ewcNR + i]);
+                wc->wcsc[key].n = gmx::roundToInt(bufSub[key]);
             }
         }
 
         // TODO Use MPI_Reduce
-        MPI_Allreduce(cycles, cycles_sum.data(), nsum, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+        MPI_Allreduce(cyclesMain.data(), cycles_sum.data(), cyclesMain.size(), MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
+        if (sc_useCycleSubcounters)
+        {
+            MPI_Allreduce(cyclesSub.data(),
+                          cycles_sum.data() + sc_numWallCycleCounters,
+                          cyclesSub.size(),
+                          MPI_DOUBLE,
+                          MPI_SUM,
+                          cr->mpi_comm_mysim);
+        }
 
-        if (wc->wcc_all != nullptr)
+        if (!wc->wcc_all.empty())
         {
-            double *buf_all, *cyc_all;
+            std::array<double, sc_numWallCycleCountersSquared> cyc_all;
+            std::array<double, sc_numWallCycleCountersSquared> buf_all;
 
-            snew(cyc_all, ewcNR * ewcNR);
-            snew(buf_all, ewcNR * ewcNR);
-            for (i = 0; i < ewcNR * ewcNR; i++)
+            for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
             {
                 cyc_all[i] = wc->wcc_all[i].c;
             }
             // TODO Use MPI_Reduce
-            MPI_Allreduce(cyc_all, buf_all, ewcNR * ewcNR, MPI_DOUBLE, MPI_SUM, cr->mpi_comm_mysim);
-            for (i = 0; i < ewcNR * ewcNR; i++)
+            MPI_Allreduce(cyc_all.data(),
+                          buf_all.data(),
+                          sc_numWallCycleCountersSquared,
+                          MPI_DOUBLE,
+                          MPI_SUM,
+                          cr->mpi_comm_mysim);
+            for (int i = 0; i < sc_numWallCycleCountersSquared; i++)
             {
                 wc->wcc_all[i].c = static_cast<gmx_cycles_t>(buf_all[i]);
             }
-            sfree(buf_all);
-            sfree(cyc_all);
         }
     }
     else
 #endif
     {
-        for (i = 0; i < nsum; i++)
+        for (auto key : keysOf(cyclesMain))
         {
-            cycles_sum[i] = cycles[i];
+            cycles_sum[static_cast<int>(key)] = cyclesMain[key];
+        }
+        if (sc_useCycleSubcounters)
+        {
+            for (auto key : keysOf(cyclesSub))
+            {
+                const int offset   = static_cast<int>(key) + sc_numWallCycleCounters;
+                cycles_sum[offset] = cyclesSub[key];
+            }
         }
     }
 
@@ -669,14 +680,14 @@ void wallcycle_print(FILE*                            fplog,
                      int                              nth_pp,
                      int                              nth_pme,
                      double                           realtime,
-                     gmx_wallcycle_t                  wc,
+                     gmx_wallcycle*                   wc,
                      const WallcycleCounts&           cyc_sum,
                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t)
 {
     double      tot, tot_for_pp, tot_for_rest, tot_cpu_overlap, gpu_cpu_ratio;
     double      c2t, c2t_pp, c2t_pme = 0;
-    int         i, j, npp, nth_tot;
+    int         npp, nth_tot;
     char        buf[STRLEN];
     const char* hline =
             "-----------------------------------------------------------------------------";
@@ -699,7 +710,7 @@ void wallcycle_print(FILE*                            fplog,
     /* When using PME-only nodes, the next line is valid for both
        PP-only and PME-only nodes because they started ewcRUN at the
        same time. */
-    tot        = cyc_sum[ewcRUN];
+    tot        = cyc_sum[static_cast<int>(WallCycleCounter::Run)];
     tot_for_pp = 0;
 
     if (tot <= 0.0)
@@ -749,44 +760,63 @@ void wallcycle_print(FILE*                            fplog,
     print_header(fplog, npp, nth_pp, npme, nth_pme);
 
     fprintf(fplog, "%s\n", hline);
-    for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
+    gmx::EnumerationWrapper<WallCycleCounter> iter;
+    for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
+         key != iter.end();
+         ++key)
     {
-        if (is_pme_subcounter(i))
+
+        if (is_pme_subcounter(*key))
         {
             /* Do not count these at all */
         }
-        else if (npme > 0 && is_pme_counter(i))
+        else if (npme > 0 && is_pme_counter(*key))
         {
             /* Print timing information for PME-only nodes, but add an
              * asterisk so the reader of the table can know that the
              * walltimes are not meant to add up. The asterisk still
              * fits in the required maximum of 19 characters. */
-            char buffer[STRLEN];
-            snprintf(buffer, STRLEN, "%s *", wcn[i]);
-            print_cycles(fplog, c2t_pme, buffer, npme, nth_pme, wc->wcc[i].n, cyc_sum[i], tot);
+            std::string message = gmx::formatString("%s *", enumValuetoString(*key));
+            print_cycles(fplog,
+                         c2t_pme,
+                         message.c_str(),
+                         npme,
+                         nth_pme,
+                         wc->wcc[*key].n,
+                         cyc_sum[static_cast<int>(*key)],
+                         tot);
         }
         else
         {
             /* Print timing information when it is for a PP or PP+PME
                node */
-            print_cycles(fplog, c2t_pp, wcn[i], npp, nth_pp, wc->wcc[i].n, cyc_sum[i], tot);
-            tot_for_pp += cyc_sum[i];
+            print_cycles(fplog,
+                         c2t_pp,
+                         enumValuetoString(*key),
+                         npp,
+                         nth_pp,
+                         wc->wcc[*key].n,
+                         cyc_sum[static_cast<int>(*key)],
+                         tot);
+            tot_for_pp += cyc_sum[static_cast<int>(*key)];
         }
     }
-    if (wc->wcc_all != nullptr)
+    if (!wc->wcc_all.empty())
     {
-        for (i = 0; i < ewcNR; i++)
+        for (auto i : keysOf(wc->wcc))
         {
-            for (j = 0; j < ewcNR; j++)
+            const int countI = static_cast<int>(i);
+            for (auto j : keysOf(wc->wcc))
             {
-                snprintf(buf, 20, "%-9.9s %-9.9s", wcn[i], wcn[j]);
+                const int countJ = static_cast<int>(j);
+                snprintf(buf, 20, "%-9.9s %-9.9s", enumValuetoString(i), enumValuetoString(j));
                 print_cycles(fplog,
                              c2t_pp,
                              buf,
                              npp,
                              nth_pp,
-                             wc->wcc_all[i * ewcNR + j].n,
-                             wc->wcc_all[i * ewcNR + j].c,
+                             wc->wcc_all[countI * sc_numWallCycleCounters + countJ].n,
+                             wc->wcc_all[countI * sc_numWallCycleCounters + countJ].c,
                              tot);
             }
         }
@@ -806,16 +836,18 @@ void wallcycle_print(FILE*                            fplog,
                 hline);
     }
 
-    if (wc->wcc[ewcPMEMESH].n > 0)
+    if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
     {
         // A workaround to not print breakdown when no subcounters were recorded.
         // TODO: figure out and record PME GPU counters (what to do with the waiting ones?)
-        std::vector<int> validPmeSubcounterIndices;
-        for (i = ewcPPDURINGPME + 1; i < ewcNR; i++)
+        std::vector<WallCycleCounter> validPmeSubcounterIndices;
+        for (auto key = gmx::EnumerationIterator<WallCycleCounter>(WallCycleCounter::Domdec);
+             key != iter.end();
+             key++)
         {
-            if (is_pme_subcounter(i) && wc->wcc[i].n > 0)
+            if (is_pme_subcounter(*key) && wc->wcc[*key].n > 0)
             {
-                validPmeSubcounterIndices.push_back(i);
+                validPmeSubcounterIndices.push_back(*key);
             }
         }
 
@@ -827,24 +859,31 @@ void wallcycle_print(FILE*                            fplog,
             {
                 print_cycles(fplog,
                              npme > 0 ? c2t_pme : c2t_pp,
-                             wcn[i],
+                             enumValuetoString(i),
                              npme > 0 ? npme : npp,
                              nth_pme,
                              wc->wcc[i].n,
-                             cyc_sum[i],
+                             cyc_sum[static_cast<int>(i)],
                              tot);
             }
             fprintf(fplog, "%s\n", hline);
         }
     }
 
-    if (sc_useCycleSubcounters && wc->wcsc)
+    if (sc_useCycleSubcounters)
     {
         fprintf(fplog, " Breakdown of PP computation\n");
         fprintf(fplog, "%s\n", hline);
-        for (i = 0; i < ewcsNR; i++)
+        for (auto key : keysOf(wc->wcsc))
         {
-            print_cycles(fplog, c2t_pp, wcsn[i], npp, nth_pp, wc->wcsc[i].n, cyc_sum[ewcNR + i], tot);
+            print_cycles(fplog,
+                         c2t_pp,
+                         enumValuetoString(key),
+                         npp,
+                         nth_pp,
+                         wc->wcsc[key].n,
+                         cyc_sum[sc_numWallCycleCounters + static_cast<int>(key)],
+                         tot);
         }
         fprintf(fplog, "%s\n", hline);
     }
@@ -865,19 +904,19 @@ void wallcycle_print(FILE*                            fplog,
         tot_gpu += gpu_nbnxn_t->pl_h2d_t + gpu_nbnxn_t->nb_h2d_t + gpu_nbnxn_t->nb_d2h_t;
 
         /* add up the kernel timings */
-        for (i = 0; i < 2; i++)
+        for (int i = 0; i < 2; i++)
         {
-            for (j = 0; j < 2; j++)
+            for (int j = 0; j < 2; j++)
             {
                 tot_gpu += gpu_nbnxn_t->ktime[i][j].t;
             }
         }
         tot_gpu += gpu_nbnxn_t->pruneTime.t;
 
-        tot_cpu_overlap = wc->wcc[ewcFORCE].c;
-        if (wc->wcc[ewcPMEMESH].n > 0)
+        tot_cpu_overlap = wc->wcc[WallCycleCounter::Force].c;
+        if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
         {
-            tot_cpu_overlap += wc->wcc[ewcPMEMESH].c;
+            tot_cpu_overlap += wc->wcc[WallCycleCounter::PmeMesh].c;
         }
         tot_cpu_overlap *= realtime * 1000 / tot; /* convert s to ms */
 
@@ -889,9 +928,9 @@ void wallcycle_print(FILE*                            fplog,
         print_gputimes(fplog, "Pair list H2D", gpu_nbnxn_t->pl_h2d_c, gpu_nbnxn_t->pl_h2d_t, tot_gpu);
         print_gputimes(fplog, "X / q H2D", gpu_nbnxn_t->nb_c, gpu_nbnxn_t->nb_h2d_t, tot_gpu);
 
-        for (i = 0; i < 2; i++)
+        for (int i = 0; i < 2; i++)
         {
-            for (j = 0; j < 2; j++)
+            for (int j = 0; j < 2; j++)
             {
                 if (gpu_nbnxn_t->ktime[i][j].c)
                 {
@@ -939,18 +978,18 @@ void wallcycle_print(FILE*                            fplog,
             fprintf(fplog, "%s\n", hline);
         }
         gpu_cpu_ratio = tot_gpu / tot_cpu_overlap;
-        if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[ewcFORCE].n > 0)
+        if (gpu_nbnxn_t->nb_c > 0 && wc->wcc[WallCycleCounter::Force].n > 0)
         {
             fprintf(fplog,
                     "\nAverage per-step force GPU/CPU evaluation time ratio: %.3f ms/%.3f ms = "
                     "%.3f\n",
                     tot_gpu / gpu_nbnxn_t->nb_c,
-                    tot_cpu_overlap / wc->wcc[ewcFORCE].n,
+                    tot_cpu_overlap / wc->wcc[WallCycleCounter::Force].n,
                     gpu_cpu_ratio);
         }
 
         /* only print notes related to CPU-GPU load balance with PME */
-        if (wc->wcc[ewcPMEMESH].n > 0)
+        if (wc->wcc[WallCycleCounter::PmeMesh].n > 0)
         {
             fprintf(fplog, "For optimal resource utilization this ratio should be close to 1\n");
 
@@ -1011,10 +1050,12 @@ void wallcycle_print(FILE*                            fplog,
                         "call, so timings are not those of real runs.");
     }
 
-    if (wc->wcc[ewcNB_XF_BUF_OPS].n > 0 && (cyc_sum[ewcDOMDEC] > tot * 0.1 || cyc_sum[ewcNS] > tot * 0.1))
+    if (wc->wcc[WallCycleCounter::NbXFBufOps].n > 0
+        && (cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] > tot * 0.1
+            || cyc_sum[static_cast<int>(WallCycleCounter::NS)] > tot * 0.1))
     {
         /* Only the sim master calls this function, so always print to stderr */
-        if (wc->wcc[ewcDOMDEC].n == 0)
+        if (wc->wcc[WallCycleCounter::Domdec].n == 0)
         {
             GMX_LOG(mdlog.warning)
                     .asParagraph()
@@ -1022,7 +1063,7 @@ void wallcycle_print(FILE*                            fplog,
                             "NOTE: %d %% of the run time was spent in pair search,\n"
                             "      you might want to increase nstlist (this has no effect on "
                             "accuracy)\n",
-                            gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
+                            gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
         }
         else
         {
@@ -1033,38 +1074,36 @@ void wallcycle_print(FILE*                            fplog,
                             "      %d %% of the run time was spent in pair search,\n"
                             "      you might want to increase nstlist (this has no effect on "
                             "accuracy)\n",
-                            gmx::roundToInt(100 * cyc_sum[ewcDOMDEC] / tot),
-                            gmx::roundToInt(100 * cyc_sum[ewcNS] / tot));
+                            gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::Domdec)] / tot),
+                            gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::NS)] / tot));
         }
     }
 
-    if (cyc_sum[ewcMoveE] > tot * 0.05)
+    if (cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] > tot * 0.05)
     {
         GMX_LOG(mdlog.warning)
                 .asParagraph()
                 .appendTextFormatted(
                         "NOTE: %d %% of the run time was spent communicating energies,\n"
                         "      you might want to increase some nst* mdp options\n",
-                        gmx::roundToInt(100 * cyc_sum[ewcMoveE] / tot));
+                        gmx::roundToInt(100 * cyc_sum[static_cast<int>(WallCycleCounter::MoveE)] / tot));
     }
 }
 
-extern int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc)
+int64_t wcycle_get_reset_counters(gmx_wallcycle* wc)
 {
     if (wc == nullptr)
     {
         return -1;
     }
-
     return wc->reset_counters;
 }
 
-extern void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters)
+void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters)
 {
     if (wc == nullptr)
     {
         return;
     }
-
     wc->reset_counters = reset_counters;
 }
diff --git a/src/gromacs/timing/wallcycle.h b/src/gromacs/timing/wallcycle.h
index 133b90216c..f68035f2a5 100644
--- a/src/gromacs/timing/wallcycle.h
+++ b/src/gromacs/timing/wallcycle.h
@@ -43,12 +43,24 @@
 
 #include <stdio.h>
 
+#include <array>
+#include <memory>
+#include <vector>
+
 #include "gromacs/timing/cyclecounter.h"
 #include "gromacs/utility/basedefinitions.h"
+#include "gromacs/utility/enumerationhelpers.h"
+
+#ifndef DEBUG_WCYCLE
+/*! \brief Enables consistency checking for the counters.
+ *
+ * If the macro is set to 1, code checks if you stop a counter different from the last
+ * one that was opened and if you do nest too deep.
+ */
+#    define DEBUG_WCYCLE 0
+#endif
 
-typedef struct gmx_wallcycle* gmx_wallcycle_t;
 struct t_commrec;
-static constexpr gmx_wallcycle* nullWallcycle = nullptr;
 
 #ifndef DEBUG_WCYCLE
 /*! \brief Enables consistency checking for the counters.
@@ -59,95 +71,98 @@ static constexpr gmx_wallcycle* nullWallcycle = nullptr;
 #    define DEBUG_WCYCLE 0
 #endif
 
-enum
+enum class WallCycleCounter : int
 {
-    ewcRUN,
-    ewcSTEP,
-    ewcPPDURINGPME,
-    ewcDOMDEC,
-    ewcDDCOMMLOAD,
-    ewcDDCOMMBOUND,
-    ewcVSITECONSTR,
-    ewcPP_PMESENDX,
-    ewcNS,
-    ewcLAUNCH_GPU,
-    ewcMOVEX,
-    ewcFORCE,
-    ewcMOVEF,
-    ewcPMEMESH,
-    ewcPME_REDISTXF,
-    ewcPME_SPREAD,
-    ewcPME_GATHER,
-    ewcPME_FFT,
-    ewcPME_FFTCOMM,
-    ewcLJPME,
-    ewcPME_SOLVE,
-    ewcPMEWAITCOMM,
-    ewcPP_PMEWAITRECVF,
-    ewcWAIT_GPU_PME_SPREAD,
-    ewcPME_FFT_MIXED_MODE,
-    ewcPME_SOLVE_MIXED_MODE,
-    ewcWAIT_GPU_PME_GATHER,
-    ewcWAIT_GPU_BONDED,
-    ewcPME_GPU_F_REDUCTION,
-    ewcWAIT_GPU_NB_NL,
-    ewcWAIT_GPU_NB_L,
-    ewcWAIT_GPU_STATE_PROPAGATOR_DATA,
-    ewcNB_XF_BUF_OPS,
-    ewcVSITESPREAD,
-    ewcPULLPOT,
-    ewcAWH,
-    ewcTRAJ,
-    ewcUPDATE,
-    ewcCONSTR,
-    ewcMoveE,
-    ewcROT,
-    ewcROTadd,
-    ewcSWAP,
-    ewcIMD,
-    ewcTEST,
-    ewcNR
+    Run,
+    Step,
+    PpDuringPme,
+    Domdec,
+    DDCommLoad,
+    DDCommBound,
+    VsiteConstr,
+    PpPmeSendX,
+    NS,
+    LaunchGpu,
+    MoveX,
+    Force,
+    MoveF,
+    PmeMesh,
+    PmeRedistXF,
+    PmeSpread,
+    PmeGather,
+    PmeFft,
+    PmeFftComm,
+    LJPme,
+    PmeSolve,
+    PmeWaitComm,
+    PpPmeWaitRecvF,
+    WaitGpuPmeSpread,
+    PmeFftMixedMode,
+    PmeSolveMixedMode,
+    WaitGpuPmeGather,
+    WaitGpuBonded,
+    PmeGpuFReduction,
+    WaitGpuNbNL,
+    WaitGpuNbL,
+    WaitGpuStatePropagatorData,
+    NbXFBufOps,
+    VsiteSpread,
+    PullPot,
+    Awh,
+    Traj,
+    Update,
+    Constr,
+    MoveE,
+    Rot,
+    RotAdd,
+    Swap,
+    Imd,
+    Test,
+    Count
 };
 
-enum
+enum class WallCycleSubCounter : int
 {
-    ewcsDD_REDIST,
-    ewcsDD_GRID,
-    ewcsDD_SETUPCOMM,
-    ewcsDD_MAKETOP,
-    ewcsDD_MAKECONSTR,
-    ewcsDD_TOPOTHER,
-    ewcsDD_GPU,
-    ewcsNBS_GRID_LOCAL,
-    ewcsNBS_GRID_NONLOCAL,
-    ewcsNBS_SEARCH_LOCAL,
-    ewcsNBS_SEARCH_NONLOCAL,
-    ewcsLISTED,
-    ewcsLISTED_FEP,
-    ewcsRESTRAINTS,
-    ewcsLISTED_BUF_OPS,
-    ewcsNONBONDED_PRUNING,
-    ewcsNONBONDED_KERNEL,
-    ewcsNONBONDED_CLEAR,
-    ewcsNONBONDED_FEP,
-    ewcsLAUNCH_GPU_NONBONDED,
-    ewcsLAUNCH_GPU_BONDED,
-    ewcsLAUNCH_GPU_PME,
-    ewcsLAUNCH_STATE_PROPAGATOR_DATA,
-    ewcsEWALD_CORRECTION,
-    ewcsNB_X_BUF_OPS,
-    ewcsNB_F_BUF_OPS,
-    ewcsCLEAR_FORCE_BUFFER,
-    ewcsLAUNCH_GPU_NB_X_BUF_OPS,
-    ewcsLAUNCH_GPU_NB_F_BUF_OPS,
-    ewcsLAUNCH_GPU_MOVEX,
-    ewcsLAUNCH_GPU_MOVEF,
-    ewcsLAUNCH_GPU_UPDATE_CONSTRAIN,
-    ewcsTEST,
-    ewcsNR
+    DDRedist,
+    DDGrid,
+    DDSetupComm,
+    DDMakeTop,
+    DDMakeConstr,
+    DDTopOther,
+    DDGpu,
+    NBSGridLocal,
+    NBSGridNonLocal,
+    NBSSearchLocal,
+    NBSSearchNonLocal,
+    Listed,
+    ListedFep,
+    Restraints,
+    ListedBufOps,
+    NonbondedPruning,
+    NonbondedKernel,
+    NonbondedClear,
+    NonbondedFep,
+    LaunchGpuNonBonded,
+    LaunchGpuBonded,
+    LaunchGpuPme,
+    LaunchStatePropagatorData,
+    EwaldCorrection,
+    NBXBufOps,
+    NBFBufOps,
+    ClearForceBuffer,
+    LaunchGpuNBXBufOps,
+    LaunchGpuNBFBufOps,
+    LaunchGpuMoveX,
+    LaunchGpuMoveF,
+    LaunchGpuUpdateConstrain,
+    Test,
+    Count
 };
 
-static constexpr const bool sc_useCycleSubcounters = GMX_CYCLE_SUBCOUNTERS;
+static constexpr int sc_numWallCycleCounters        = static_cast<int>(WallCycleCounter::Count);
+static constexpr int sc_numWallCycleSubCounters     = static_cast<int>(WallCycleSubCounter::Count);
+static constexpr int sc_numWallCycleCountersSquared = sc_numWallCycleCounters * sc_numWallCycleCounters;
+static constexpr bool sc_useCycleSubcounters        = GMX_CYCLE_SUBCOUNTERS;
 
 struct wallcc_t
 {
@@ -163,57 +178,53 @@ static constexpr int c_MaxWallCycleDepth = 6;
 
 struct gmx_wallcycle
 {
-    wallcc_t* wcc;
+    gmx::EnumerationArray<WallCycleCounter, wallcc_t> wcc;
     /* did we detect one or more invalid cycle counts */
     bool haveInvalidCount;
     /* variables for testing/debugging */
-    bool      wc_barrier;
-    wallcc_t* wcc_all;
-    int       wc_depth;
+    bool                  wc_barrier;
+    std::vector<wallcc_t> wcc_all;
+    int                   wc_depth;
 #if DEBUG_WCYCLE
-    int* counterlist;
-    int  count_depth;
-    bool isMasterRank;
+    std::array<WallCycleCounter, c_MaxWallCycleDepth> counterlist;
+    int                                               count_depth;
+    bool                                              isMasterRank;
 #endif
-    int              ewc_prev;
-    gmx_cycles_t     cycle_prev;
-    int64_t          reset_counters;
-    const t_commrec* cr;
-    wallcc_t*        wcsc;
+    WallCycleCounter                                     ewc_prev;
+    gmx_cycles_t                                         cycle_prev;
+    int64_t                                              reset_counters;
+    const t_commrec*                                     cr;
+    gmx::EnumerationArray<WallCycleSubCounter, wallcc_t> wcsc;
 };
 
-//! Returns whether cycle counting is supported.
+//! Returns if cycle counting is supported
 bool wallcycle_have_counter();
 
-/*! \brief
- * Returns a wallcycle datastructure.
- *
- * If cycle counting is not supported, returns nullptr instead.
- */
-gmx_wallcycle_t wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr);
-
-//! Cleans up wallcycle structure.
-void wallcycle_destroy(gmx_wallcycle_t wc);
+//! Returns the wall cycle structure.
+std::unique_ptr<gmx_wallcycle> wallcycle_init(FILE* fplog, int resetstep, const t_commrec* cr);
 
 //! Adds custom barrier for wallcycle counting.
 void wallcycleBarrier(gmx_wallcycle* wc);
 
-inline void wallcycle_all_start(gmx_wallcycle* wc, int ewc, gmx_cycles_t cycle)
+void wallcycle_sub_get(gmx_wallcycle* wc, WallCycleSubCounter ewcs, int* n, double* c);
+/* Returns the cumulative count and sub cycle count for ewcs */
+
+inline void wallcycle_all_start(gmx_wallcycle* wc, WallCycleCounter ewc, gmx_cycles_t cycle)
 {
     wc->ewc_prev   = ewc;
     wc->cycle_prev = cycle;
 }
 
-inline void wallcycle_all_stop(gmx_wallcycle* wc, int ewc, gmx_cycles_t cycle)
+inline void wallcycle_all_stop(gmx_wallcycle* wc, WallCycleCounter ewc, gmx_cycles_t cycle)
 {
-    const int prev    = wc->ewc_prev;
-    const int current = ewc;
-    wc->wcc_all[prev * ewcNR + current].n += 1;
-    wc->wcc_all[prev * ewcNR + current].c += cycle - wc->cycle_prev;
+    const int prev    = static_cast<int>(wc->ewc_prev);
+    const int current = static_cast<int>(ewc);
+    wc->wcc_all[prev * sc_numWallCycleCounters + current].n += 1;
+    wc->wcc_all[prev * sc_numWallCycleCounters + current].c += cycle - wc->cycle_prev;
 }
 
-//! Starts the cycle counter for \c ewc (and increases the call count).
-inline void wallcycle_start(gmx_wallcycle_t wc, int ewc)
+//! Starts the cycle counter (and increases the call count)
+inline void wallcycle_start(gmx_wallcycle* wc, WallCycleCounter ewc)
 {
     if (wc == nullptr)
     {
@@ -227,10 +238,10 @@ inline void wallcycle_start(gmx_wallcycle_t wc, int ewc)
 #endif
     gmx_cycles_t cycle = gmx_cycles_read();
     wc->wcc[ewc].start = cycle;
-    if (wc->wcc_all)
+    if (!wc->wcc_all.empty())
     {
         wc->wc_depth++;
-        if (ewc == ewcRUN)
+        if (ewc == WallCycleCounter::Run)
         {
             wallcycle_all_start(wc, ewc, cycle);
         }
@@ -241,8 +252,8 @@ inline void wallcycle_start(gmx_wallcycle_t wc, int ewc)
     }
 }
 
-//! Starts the cycle counter for \c ewc without increasing the call count.
-inline void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
+//! Starts the cycle counter without increasing the call count
+inline void wallcycle_start_nocount(gmx_wallcycle* wc, WallCycleCounter ewc)
 {
     if (wc == nullptr)
     {
@@ -251,8 +262,8 @@ inline void wallcycle_start_nocount(gmx_wallcycle_t wc, int ewc)
     wc->wcc[ewc].n++;
 }
 
-//! Stop the cycle count for \c ewc, returns the last cycle count.
-inline double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
+//! Stop the cycle count for ewc , returns the last cycle count
+inline double wallcycle_stop(gmx_wallcycle* wc, WallCycleCounter ewc)
 {
     gmx_cycles_t cycle, last;
 
@@ -287,10 +298,10 @@ inline double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
     }
     wc->wcc[ewc].c += last;
     wc->wcc[ewc].n++;
-    if (wc->wcc_all)
+    if (!wc->wcc_all.empty())
     {
         wc->wc_depth--;
-        if (ewc == ewcRUN)
+        if (ewc == WallCycleCounter::Run)
         {
             wallcycle_all_stop(wc, ewc, cycle);
         }
@@ -303,8 +314,8 @@ inline double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
     return last;
 }
 
-//! Only increment call count for \c ewc by one.
-inline void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
+//! Only increment call count for ewc by one
+inline void wallcycle_increment_event_count(gmx_wallcycle* wc, WallCycleCounter ewc)
 {
     if (wc == nullptr)
     {
@@ -313,26 +324,23 @@ inline void wallcycle_increment_event_count(gmx_wallcycle_t wc, int ewc)
     wc->wcc[ewc].n++;
 }
 
-//! Returns the cumulative count and cycle count for \c ewc.
-void wallcycle_get(gmx_wallcycle_t wc, int ewc, int* n, double* c);
-
-//! Returns the cumulative count and sub cycle count for \c ewcs.
-void wallcycle_sub_get(gmx_wallcycle_t wc, int ewcs, int* n, double* c);
+//! Returns the cumulative count and cycle count for ewc
+void wallcycle_get(gmx_wallcycle* wc, WallCycleCounter ewc, int* n, double* c);
 
-//! Resets all cycle counters to zero.
-void wallcycle_reset_all(gmx_wallcycle_t wc);
+//! Resets all cycle counters to zero
+void wallcycle_reset_all(gmx_wallcycle* wc);
 
-//! Scale the cycle counts to reflect how many threads run for that number of cycles.
-void wallcycle_scale_by_num_threads(gmx_wallcycle_t wc, bool isPmeRank, int nthreads_pp, int nthreads_pme);
+//! Scale the cycle counts to reflect how many threads run for that number of cycles
+void wallcycle_scale_by_num_threads(gmx_wallcycle* wc, bool isPmeRank, int nthreads_pp, int nthreads_pme);
 
-//! Return reset_counters.
-int64_t wcycle_get_reset_counters(gmx_wallcycle_t wc);
+//! Return reset_counters from wc struct
+int64_t wcycle_get_reset_counters(gmx_wallcycle* wc);
 
-//! Set reset_counters.
-void wcycle_set_reset_counters(gmx_wallcycle_t wc, int64_t reset_counters);
+//! Set reset_counters
+void wcycle_set_reset_counters(gmx_wallcycle* wc, int64_t reset_counters);
 
-//! Set the start sub cycle count for \c ewcs.
-inline void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
+//! Set the start sub cycle count for ewcs
+inline void wallcycle_sub_start(gmx_wallcycle* wc, WallCycleSubCounter ewcs)
 {
     if (sc_useCycleSubcounters && wc != nullptr)
     {
@@ -340,8 +348,8 @@ inline void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
     }
 }
 
-//! Set the start sub cycle count for \c ewcs without increasing the call count.
-inline void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
+//! Set the start sub cycle count for ewcs without increasing the call count
+inline void wallcycle_sub_start_nocount(gmx_wallcycle* wc, WallCycleSubCounter ewcs)
 {
     if (sc_useCycleSubcounters && wc != nullptr)
     {
@@ -349,8 +357,8 @@ inline void wallcycle_sub_start_nocount(gmx_wallcycle_t wc, int ewcs)
     }
 }
 
-//! Stop the sub cycle count for \c ewcs.
-inline void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
+//! Stop the sub cycle count for ewcs
+inline void wallcycle_sub_stop(gmx_wallcycle* wc, WallCycleSubCounter ewcs)
 {
     if (sc_useCycleSubcounters && wc != nullptr)
     {
diff --git a/src/gromacs/timing/wallcyclereporting.h b/src/gromacs/timing/wallcyclereporting.h
index 1bf3096fea..950b5b8af6 100644
--- a/src/gromacs/timing/wallcyclereporting.h
+++ b/src/gromacs/timing/wallcyclereporting.h
@@ -4,7 +4,7 @@
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2008, The GROMACS development team.
  * Copyright (c) 2013,2014,2015,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -45,6 +45,7 @@
 
 #include <array>
 
+#include "gromacs/timing/wallcycle.h"
 #include "gromacs/utility/basedefinitions.h"
 
 struct t_commrec;
@@ -54,14 +55,13 @@ namespace gmx
 class MDLogger;
 }
 
-typedef struct gmx_wallcycle* gmx_wallcycle_t;
 struct gmx_wallclock_gpu_nbnxn_t;
 struct gmx_wallclock_gpu_pme_t;
 
-typedef std::array<double, int(ewcNR) + int(ewcsNR)> WallcycleCounts;
+using WallcycleCounts = std::array<double, sc_numWallCycleCounters + sc_numWallCycleSubCounters>;
 /* Convenience typedef */
 
-WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle_t wc);
+WallcycleCounts wallcycle_sum(const t_commrec* cr, gmx_wallcycle* wc);
 /* Return a vector of the sum of cycle counts over the nodes in
    cr->mpi_comm_mysim. */
 
@@ -72,7 +72,7 @@ void wallcycle_print(FILE*                            fplog,
                      int                              nth_pp,
                      int                              nth_pme,
                      double                           realtime,
-                     gmx_wallcycle_t                  wc,
+                     gmx_wallcycle*                   wc,
                      const WallcycleCounts&           cyc_sum,
                      const gmx_wallclock_gpu_nbnxn_t* gpu_nbnxn_t,
                      const gmx_wallclock_gpu_pme_t*   gpu_pme_t);
-- 
2.22.0