From 42bea84075544680a2a23cbcda62b326271d7d4c Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Tue, 30 Mar 2021 20:21:13 +0000 Subject: [PATCH] Move dynamic pull groups and pull nthreads Moved dynamic pull groups into their respective coordinates. Set the number of threads per pull group instead of globally. Now all thread parallel regions run on 1 thread with less than 100 local atoms. --- src/gromacs/pulling/output.cpp | 4 +- src/gromacs/pulling/pull.cpp | 143 ++++++++++++++-------------- src/gromacs/pulling/pull_internal.h | 26 +++-- src/gromacs/pulling/pullutil.cpp | 122 ++++++++++++------------ 4 files changed, 152 insertions(+), 143 deletions(-) diff --git a/src/gromacs/pulling/output.cpp b/src/gromacs/pulling/output.cpp index 691a248872..220b2d6063 100644 --- a/src/gromacs/pulling/output.cpp +++ b/src/gromacs/pulling/output.cpp @@ -95,7 +95,7 @@ static void addToPullxHistory(pull_t* pull) { for (int m = 0; m < DIM; m++) { - pcrdHistory.dynaX[m] += pull->dyna[c].x[m]; + pcrdHistory.dynaX[m] += pcrd.dynamicGroup0->x[m]; } } } @@ -236,7 +236,7 @@ static void pull_print_x(FILE* out, pull_t* pull, double t) } else { - fprintf(out, "\t%g", pull->dyna[c].x[m]); + fprintf(out, "\t%g", pcrd.dynamicGroup0->x[m]); } } } diff --git a/src/gromacs/pulling/pull.cpp b/src/gromacs/pulling/pull.cpp index 388c563f0c..6d5ea22148 100644 --- a/src/gromacs/pulling/pull.cpp +++ b/src/gromacs/pulling/pull.cpp @@ -89,6 +89,8 @@ namespace gmx extern template LocalAtomSet LocalAtomSetManager::add(ArrayRef globalAtomIndex); } // namespace gmx +using gmx::ArrayRef; + static int groupPbcFromParams(const t_pull_group& params, bool setPbcRefToPrevStepCOM) { if (params.ind.size() <= 1) @@ -120,9 +122,11 @@ static int groupPbcFromParams(const t_pull_group& params, bool setPbcRefToPrevSt */ pull_group_work_t::pull_group_work_t(const t_pull_group& params, gmx::LocalAtomSet atomSet, - bool bSetPbcRefToPrevStepCOM) : + bool bSetPbcRefToPrevStepCOM, + int maxNumThreads) : params(params), epgrppbc(groupPbcFromParams(params, bSetPbcRefToPrevStepCOM)), + maxNumThreads(maxNumThreads), needToCalcCom(false), atomSet(atomSet), mwscale(0), @@ -176,24 +180,24 @@ double pull_conversion_factor_internal2userinput(const t_pull_coord* pcrd) } /* Apply forces in a mass weighted fashion for part of the pull group */ -static void apply_forces_grp_part(const pull_group_work_t* pgrp, +static void apply_forces_grp_part(const pull_group_work_t& pgrp, int ind_start, int ind_end, gmx::ArrayRef masses, const dvec f_pull, - int sign, + const int sign, rvec* f) { - double inv_wm = pgrp->mwscale; + const double inv_wm = pgrp.mwscale; - auto localAtomIndices = pgrp->atomSet.localIndex(); + auto localAtomIndices = pgrp.atomSet.localIndex(); for (int i = ind_start; i < ind_end; i++) { int ii = localAtomIndices[i]; double wmass = masses[ii]; - if (!pgrp->localWeights.empty()) + if (!pgrp.localWeights.empty()) { - wmass *= pgrp->localWeights[i]; + wmass *= pgrp.localWeights[i]; } for (int d = 0; d < DIM; d++) @@ -204,16 +208,15 @@ static void apply_forces_grp_part(const pull_group_work_t* pgrp, } /* Apply forces in a mass weighted fashion */ -static void apply_forces_grp(const pull_group_work_t* pgrp, +static void apply_forces_grp(const pull_group_work_t& pgrp, gmx::ArrayRef masses, const dvec f_pull, - int sign, - rvec* f, - int nthreads) + const int sign, + rvec* f) { - auto localAtomIndices = pgrp->atomSet.localIndex(); + auto localAtomIndices = pgrp.atomSet.localIndex(); - if (pgrp->params.ind.size() == 1 && pgrp->atomSet.numAtomsLocal() == 1) + if 
(pgrp.params.ind.size() == 1 && pgrp.atomSet.numAtomsLocal() == 1) { /* Only one atom and our rank has this atom: we can skip * the mass weighting, which means that this code also works @@ -226,17 +229,18 @@ static void apply_forces_grp(const pull_group_work_t* pgrp, } else { - if (localAtomIndices.size() <= c_pullMaxNumLocalAtomsSingleThreaded) + const int numThreads = pgrp.numThreads(); + if (numThreads == 1) { apply_forces_grp_part(pgrp, 0, localAtomIndices.size(), masses, f_pull, sign, f); } else { -#pragma omp parallel for num_threads(nthreads) schedule(static) - for (int th = 0; th < nthreads; th++) +#pragma omp parallel for num_threads(numThreads) schedule(static) + for (int th = 0; th < numThreads; th++) { - int ind_start = (localAtomIndices.size() * (th + 0)) / nthreads; - int ind_end = (localAtomIndices.size() * (th + 1)) / nthreads; + int ind_start = (localAtomIndices.size() * (th + 0)) / numThreads; + int ind_end = (localAtomIndices.size() * (th + 1)) / numThreads; apply_forces_grp_part(pgrp, ind_start, ind_end, masses, f_pull, sign, f); } } @@ -244,27 +248,27 @@ static void apply_forces_grp(const pull_group_work_t* pgrp, } /* Apply forces in a mass weighted fashion to a cylinder group */ -static void apply_forces_cyl_grp(const pull_group_work_t* pgrp, +static void apply_forces_cyl_grp(const pull_group_work_t& pgrp, const double dv_corr, gmx::ArrayRef masses, const dvec f_pull, double f_scal, - int sign, - rvec* f, - int gmx_unused nthreads) + const int sign, + rvec* f) { - double inv_wm = pgrp->mwscale; + const double inv_wm = pgrp.mwscale; - auto localAtomIndices = pgrp->atomSet.localIndex(); + auto localAtomIndices = pgrp.atomSet.localIndex(); /* The cylinder group is always a slab in the system, thus large. * Therefore we always thread-parallelize this group. */ - int numAtomsLocal = localAtomIndices.size(); -#pragma omp parallel for num_threads(nthreads) schedule(static) + const int numAtomsLocal = localAtomIndices.size(); + const int gmx_unused numThreads = pgrp.numThreads(); +#pragma omp parallel for num_threads(numThreads) schedule(static) for (int i = 0; i < numAtomsLocal; i++) { - double weight = pgrp->localWeights[i]; + double weight = pgrp.localWeights[i]; if (weight == 0) { continue; @@ -275,7 +279,7 @@ static void apply_forces_cyl_grp(const pull_group_work_t* pgrp, * to be corrected for an offset (dv_corr), which was unknown when * we calculated dv. */ - double dv_com = pgrp->dv[i] + dv_corr; + double dv_com = pgrp.dv[i] + dv_corr; /* Here we not only add the pull force working along vec (f_pull), * but also a radial component, due to the dependence of the weights @@ -283,7 +287,7 @@ static void apply_forces_cyl_grp(const pull_group_work_t* pgrp, */ for (int m = 0; m < DIM; m++) { - f[ii][m] += sign * inv_wm * (mass * weight * f_pull[m] + pgrp->mdw[i][m] * dv_com * f_scal); + f[ii][m] += sign * inv_wm * (mass * weight * f_pull[m] + pgrp.mdw[i][m] * dv_com * f_scal); } } } @@ -291,12 +295,12 @@ static void apply_forces_cyl_grp(const pull_group_work_t* pgrp, /* Apply torque forces in a mass weighted fashion to the groups that define * the pull vector direction for pull coordinate pcrd. 
*/ -static void apply_forces_vec_torque(const struct pull_t* pull, - const pull_coord_work_t* pcrd, - gmx::ArrayRef masses, - rvec* f) +static void apply_forces_vec_torque(const pull_coord_work_t& pcrd, + ArrayRef pullGroups, + gmx::ArrayRef masses, + rvec* f) { - const PullCoordSpatialData& spatialData = pcrd->spatialData; + const PullCoordSpatialData& spatialData = pcrd.spatialData; /* The component inpr along the pull vector is accounted for in the usual * way. Here we account for the component perpendicular to vec. @@ -315,20 +319,20 @@ static void apply_forces_vec_torque(const struct pull_t* pull, dvec f_perp; for (int m = 0; m < DIM; m++) { - f_perp[m] = (spatialData.dr01[m] - inpr * spatialData.vec[m]) / spatialData.vec_len * pcrd->scalarForce; + f_perp[m] = (spatialData.dr01[m] - inpr * spatialData.vec[m]) / spatialData.vec_len * pcrd.scalarForce; } /* Apply the force to the groups defining the vector using opposite signs */ - apply_forces_grp(&pull->group[pcrd->params.group[2]], masses, f_perp, -1, f, pull->nthreads); - apply_forces_grp(&pull->group[pcrd->params.group[3]], masses, f_perp, 1, f, pull->nthreads); + apply_forces_grp(pullGroups[pcrd.params.group[2]], masses, f_perp, -1, f); + apply_forces_grp(pullGroups[pcrd.params.group[3]], masses, f_perp, 1, f); } /* Apply forces in a mass weighted fashion */ -static void apply_forces_coord(pull_t* pull, - const pull_coord_work_t& pcrd, - const PullCoordVectorForces& forces, - gmx::ArrayRef masses, - rvec* f) +static void apply_forces_coord(const pull_coord_work_t& pcrd, + ArrayRef pullGroups, + const PullCoordVectorForces& forces, + gmx::ArrayRef masses, + rvec* f) { /* Here it would be more efficient to use one large thread-parallel * region instead of potential parallel regions within apply_forces_grp. @@ -338,14 +342,8 @@ static void apply_forces_coord(pull_t* pull, if (pcrd.params.eGeom == PullGroupGeometry::Cylinder) { - apply_forces_cyl_grp(&pull->dyna[pcrd.params.coordIndex], - pcrd.spatialData.cyl_dev, - masses, - forces.force01, - pcrd.scalarForce, - -1, - f, - pull->nthreads); + apply_forces_cyl_grp( + *pcrd.dynamicGroup0, pcrd.spatialData.cyl_dev, masses, forces.force01, pcrd.scalarForce, -1, f); /* Sum the force along the vector and the radial force */ dvec f_tot; @@ -353,7 +351,7 @@ static void apply_forces_coord(pull_t* pull, { f_tot[m] = forces.force01[m] + pcrd.scalarForce * pcrd.spatialData.ffrad[m]; } - apply_forces_grp(&pull->group[pcrd.params.group[1]], masses, f_tot, 1, f, pull->nthreads); + apply_forces_grp(pullGroups[pcrd.params.group[1]], masses, f_tot, 1, f); } else { @@ -362,27 +360,24 @@ static void apply_forces_coord(pull_t* pull, /* We need to apply the torque forces to the pull groups * that define the pull vector. 
*/ - apply_forces_vec_torque(pull, &pcrd, masses, f); + apply_forces_vec_torque(pcrd, pullGroups, masses, f); } - if (!pull->group[pcrd.params.group[0]].params.ind.empty()) + if (!pullGroups[pcrd.params.group[0]].params.ind.empty()) { - apply_forces_grp( - &pull->group[pcrd.params.group[0]], masses, forces.force01, -1, f, pull->nthreads); + apply_forces_grp(pullGroups[pcrd.params.group[0]], masses, forces.force01, -1, f); } - apply_forces_grp(&pull->group[pcrd.params.group[1]], masses, forces.force01, 1, f, pull->nthreads); + apply_forces_grp(pullGroups[pcrd.params.group[1]], masses, forces.force01, 1, f); if (pcrd.params.ngroup >= 4) { - apply_forces_grp( - &pull->group[pcrd.params.group[2]], masses, forces.force23, -1, f, pull->nthreads); - apply_forces_grp(&pull->group[pcrd.params.group[3]], masses, forces.force23, 1, f, pull->nthreads); + apply_forces_grp(pullGroups[pcrd.params.group[2]], masses, forces.force23, -1, f); + apply_forces_grp(pullGroups[pcrd.params.group[3]], masses, forces.force23, 1, f); } if (pcrd.params.ngroup >= 6) { - apply_forces_grp( - &pull->group[pcrd.params.group[4]], masses, forces.force45, -1, f, pull->nthreads); - apply_forces_grp(&pull->group[pcrd.params.group[5]], masses, forces.force45, 1, f, pull->nthreads); + apply_forces_grp(pullGroups[pcrd.params.group[4]], masses, forces.force45, -1, f); + apply_forces_grp(pullGroups[pcrd.params.group[5]], masses, forces.force45, 1, f); } } } @@ -607,7 +602,7 @@ static void get_pull_coord_dr(const pull_t& pull, pull_coord_work_t* pcrd, const pcrd, pbc, pgrp1.x, - pcrd->params.eGeom == PullGroupGeometry::Cylinder ? pull.dyna[coord_ind].x : pgrp0.x, + pcrd->params.eGeom == PullGroupGeometry::Cylinder ? pcrd->dynamicGroup0->x : pgrp0.x, 0, 1, md2, @@ -1569,7 +1564,7 @@ void apply_external_pull_coord_force(struct pull_t* pull, } apply_forces_coord( - pull, *pcrd, pullCoordForces, masses, as_rvec_array(forceWithVirial->force_.data())); + *pcrd, pull->group, pullCoordForces, masses, as_rvec_array(forceWithVirial->force_.data())); } pull->numExternalPotentialsStillToBeAppliedThisStep--; @@ -1646,7 +1641,7 @@ real pull_potential(struct pull_t* pull, *pull, pcrd, pbc, t, lambda, &V, computeVirial ? 
virial : nullptr, &dVdl); /* Distribute the force over the atoms in the pulled groups */ - apply_forces_coord(pull, *pcrd, pullCoordForces, masses, f); + apply_forces_coord(*pcrd, pull->group, pullCoordForces, masses, f); } if (MASTER(cr)) @@ -1993,11 +1988,15 @@ struct pull_t* init_pull(FILE* fplog, /* Copy the pull parameters */ pull->params = *pull_params; + /* The gmx_omp_nthreads module might not be initialized here, so max(1,) */ + const int maxNumThreads = std::max(1, gmx_omp_nthreads_get(emntDefault)); + for (int i = 0; i < pull_params->ngroup; ++i) { pull->group.emplace_back(pull_params->group[i], atomSets->add(pull_params->group[i].ind), - pull_params->bSetPbcRefToPrevStepCOM); + pull_params->bSetPbcRefToPrevStepCOM, + maxNumThreads); } if (cr != nullptr && DOMAINDECOMP(cr)) @@ -2323,7 +2322,7 @@ struct pull_t* init_pull(FILE* fplog, /* If we use cylinder coordinates, do some initialising for them */ if (pull->bCylinder) { - for (const pull_coord_work_t& coord : pull->coord) + for (pull_coord_work_t& coord : pull->coord) { if (coord.params.eGeom == PullGroupGeometry::Cylinder) { @@ -2334,15 +2333,13 @@ struct pull_t* init_pull(FILE* fplog, "reference!\n"); } } - const auto& referenceGroup = pull->group[coord.params.group[0]]; - pull->dyna.emplace_back( - referenceGroup.params, referenceGroup.atomSet, pull->params.bSetPbcRefToPrevStepCOM); + const auto& group0 = pull->group[coord.params.group[0]]; + coord.dynamicGroup0 = std::make_unique( + group0.params, group0.atomSet, pull->params.bSetPbcRefToPrevStepCOM, maxNumThreads); } } - /* The gmx_omp_nthreads module might not be initialized here, so max(1,) */ - pull->nthreads = std::max(1, gmx_omp_nthreads_get(emntDefault)); - pull->comSums.resize(pull->nthreads); + pull->comSums.resize(maxNumThreads); comm = &pull->comm; diff --git a/src/gromacs/pulling/pull_internal.h b/src/gromacs/pulling/pull_internal.h index 1973e5f116..749d70379c 100644 --- a/src/gromacs/pulling/pull_internal.h +++ b/src/gromacs/pulling/pull_internal.h @@ -4,7 +4,7 @@ * Copyright (c) 1991-2000, University of Groningen, The Netherlands. * Copyright (c) 2001-2004, The GROMACS development team. * Copyright (c) 2013,2014,2015,2016,2018 by the GROMACS development team. - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -91,14 +91,25 @@ struct pull_group_work_t * \param[in] params The group parameters set by the user * \param[in] atomSet The global to local atom set manager * \param[in] setPbcRefToPrevStepCOM Does this pull group use the COM from the previous step as reference position? + * \param[in] maxNumThreads Use either this number of threads of 1 for operations on x and f */ - pull_group_work_t(const t_pull_group& params, gmx::LocalAtomSet atomSet, bool setPbcRefToPrevStepCOM); + pull_group_work_t(const t_pull_group& params, + gmx::LocalAtomSet atomSet, + bool setPbcRefToPrevStepCOM, + int maxNumThreads); + + //! Returns the number of threads to use for local atom operations based on the local atom count + int numThreads() const + { + return atomSet.numAtomsLocal() <= c_pullMaxNumLocalAtomsSingleThreaded ? 
1 : maxNumThreads; + } /* Data only modified at initialization */ const t_pull_group params; /**< The pull group parameters */ const int epgrppbc; /**< The type of pbc for this pull group, see enum above */ - bool needToCalcCom; /**< Do we need to calculate the COM? (Not for group 0 or if only used as cylinder group) */ - std::vector globalWeights; /**< Weights per atom set by the user and/or mass/friction coefficients, if empty all weights are equal */ + const int maxNumThreads; /**< The maximum number of threads to use for operations on x and f */ + bool needToCalcCom; /**< Do we need to calculate the COM? (Not for group 0 or if only used as cylinder group) */ + std::vector globalWeights; /**< Weights per atom set by the user and/or mass/friction coefficients, if empty all weights are equal */ /* Data modified only at init or at domain decomposition */ gmx::LocalAtomSet atomSet; /**< Global to local atom set mapper */ @@ -149,6 +160,9 @@ struct pull_coord_work_t const t_pull_coord params; /* Pull coordinate parameters */ + /* Dynamic pull group 0 for this coordinate with dynamic weights, only present when needed */ + std::unique_ptr dynamicGroup0; + double value_ref; /* The reference value, usually init+rate*t, units of nm or rad */ PullCoordSpatialData spatialData; /* Data defining the current geometry */ @@ -236,7 +250,6 @@ struct pull_t /* Parameters + dynamic data for groups */ std::vector group; /* The pull group param and work data */ - std::vector dyna; /* Dynamic groups for geom=cylinder */ /* Parameters + dynamic data for coordinates */ std::vector coord; /* The pull group param and work data */ @@ -244,8 +257,7 @@ struct pull_t /* Global dynamic data */ gmx_bool bSetPBCatoms; /* Do we need to set x_pbc for the groups? */ - int nthreads; /* Number of threads used by the pull code */ - std::vector comSums; /* Work array for summing for COM, 1 entry per thread */ + std::vector comSums; /* Work array for summing for COM, 1 entry per thread */ pull_comm_t comm; /* Communication parameters, communicator and buffers */ diff --git a/src/gromacs/pulling/pullutil.cpp b/src/gromacs/pulling/pullutil.cpp index b9dbb2d68e..d332a93bc7 100644 --- a/src/gromacs/pulling/pullutil.cpp +++ b/src/gromacs/pulling/pullutil.cpp @@ -186,13 +186,11 @@ static void make_cyl_refgrps(const t_commrec* cr, double inv_cyl_r2 = 1.0 / gmx::square(pull->params.cylinder_r); /* loop over all groups to make a reference group for each*/ - for (size_t c = 0; c < pull->coord.size(); c++) + int bufferOffset = 0; + for (pull_coord_work_t& pcrd : pull->coord) { - pull_coord_work_t* pcrd; - double sum_a, wmass, wwmass; - dvec radf_fac0, radf_fac1; - - pcrd = &pull->coord[c]; + double sum_a, wmass, wwmass; + dvec radf_fac0, radf_fac1; sum_a = 0; wmass = 0; @@ -200,14 +198,14 @@ static void make_cyl_refgrps(const t_commrec* cr, clear_dvec(radf_fac0); clear_dvec(radf_fac1); - if (pcrd->params.eGeom == PullGroupGeometry::Cylinder) + if (pcrd.params.eGeom == PullGroupGeometry::Cylinder) { /* pref will be the same group for all pull coordinates */ - const pull_group_work_t& pref = pull->group[pcrd->params.group[0]]; - const pull_group_work_t& pgrp = pull->group[pcrd->params.group[1]]; - pull_group_work_t& pdyna = pull->dyna[c]; + const pull_group_work_t& pref = pull->group[pcrd.params.group[0]]; + const pull_group_work_t& pgrp = pull->group[pcrd.params.group[1]]; + pull_group_work_t& pdyna = *pcrd.dynamicGroup0; rvec direction; - copy_dvec_to_rvec(pcrd->spatialData.vec, direction); + copy_dvec_to_rvec(pcrd.spatialData.vec, 
direction); /* Since we have not calculated the COM of the cylinder group yet, * we calculate distances with respect to location of the pull @@ -215,15 +213,15 @@ static void make_cyl_refgrps(const t_commrec* cr, * here we already have the COM of the pull group. This resolves * any PBC issues and we don't need to use a PBC-atom here. */ - if (pcrd->params.rate != 0) + if (pcrd.params.rate != 0) { /* With rate=0, value_ref is set initially */ - pcrd->value_ref = pcrd->params.init + pcrd->params.rate * t; + pcrd.value_ref = pcrd.params.init + pcrd.params.rate * t; } rvec reference; for (int m = 0; m < DIM; m++) { - reference[m] = pgrp.x[m] - pcrd->spatialData.vec[m] * pcrd->value_ref; + reference[m] = pgrp.x[m] - pcrd.spatialData.vec[m] * pcrd.value_ref; } auto localAtomIndices = pref.atomSet.localIndex(); @@ -290,8 +288,9 @@ static void make_cyl_refgrps(const t_commrec* cr, } } - auto buffer = gmx::arrayRefFromArray( - comm->cylinderBuffer.data() + c * c_cylinderBufferStride, c_cylinderBufferStride); + auto buffer = gmx::arrayRefFromArray(comm->cylinderBuffer.data() + bufferOffset, + c_cylinderBufferStride); + bufferOffset += c_cylinderBufferStride; buffer[0] = wmass; buffer[1] = wwmass; @@ -312,28 +311,26 @@ static void make_cyl_refgrps(const t_commrec* cr, pullAllReduce(cr, comm, pull->coord.size() * c_cylinderBufferStride, comm->cylinderBuffer.data()); } - for (size_t c = 0; c < pull->coord.size(); c++) + bufferOffset = 0; + for (pull_coord_work_t& pcrd : pull->coord) { - pull_coord_work_t* pcrd; - - pcrd = &pull->coord[c]; - - if (pcrd->params.eGeom == PullGroupGeometry::Cylinder) + if (pcrd.params.eGeom == PullGroupGeometry::Cylinder) { - pull_group_work_t* pdyna = &pull->dyna[c]; - pull_group_work_t* pgrp = &pull->group[pcrd->params.group[1]]; - PullCoordSpatialData& spatialData = pcrd->spatialData; - - auto buffer = gmx::constArrayRefFromArray( - comm->cylinderBuffer.data() + c * c_cylinderBufferStride, c_cylinderBufferStride); - double wmass = buffer[0]; - double wwmass = buffer[1]; - pdyna->mwscale = 1.0 / wmass; + pull_group_work_t& dynamicGroup0 = *pcrd.dynamicGroup0; + pull_group_work_t& group1 = pull->group[pcrd.params.group[1]]; + PullCoordSpatialData& spatialData = pcrd.spatialData; + + auto buffer = gmx::constArrayRefFromArray(comm->cylinderBuffer.data() + bufferOffset, + c_cylinderBufferStride); + bufferOffset += c_cylinderBufferStride; + double wmass = buffer[0]; + double wwmass = buffer[1]; + dynamicGroup0.mwscale = 1.0 / wmass; /* Cylinder pulling can't be used with constraints, but we set * wscale and invtm anyhow, in case someone would like to use them. 
*/ - pdyna->wscale = wmass / wwmass; - pdyna->invtm = wwmass / (wmass * wmass); + dynamicGroup0.wscale = wmass / wwmass; + dynamicGroup0.invtm = wwmass / (wmass * wmass); /* We store the deviation of the COM from the reference location * used above, since we need it when we apply the radial forces @@ -342,9 +339,9 @@ static void make_cyl_refgrps(const t_commrec* cr, spatialData.cyl_dev = 0; for (int m = 0; m < DIM; m++) { - double reference = pgrp->x[m] - spatialData.vec[m] * pcrd->value_ref; - double dist = -spatialData.vec[m] * buffer[2] * pdyna->mwscale; - pdyna->x[m] = reference - dist; + double reference = group1.x[m] - spatialData.vec[m] * pcrd.value_ref; + double dist = -spatialData.vec[m] * buffer[2] * dynamicGroup0.mwscale; + dynamicGroup0.x[m] = reference - dist; spatialData.cyl_dev += dist; } /* Now we know the exact COM of the cylinder reference group, @@ -360,12 +357,12 @@ static void make_cyl_refgrps(const t_commrec* cr, if (debug) { fprintf(debug, - "Pull cylinder group %zu:%8.3f%8.3f%8.3f m:%8.3f\n", - c, - pdyna->x[0], - pdyna->x[1], - pdyna->x[2], - 1.0 / pdyna->invtm); + "Pull cylinder group %d:%8.3f%8.3f%8.3f m:%8.3f\n", + pcrd.params.coordIndex, + dynamicGroup0.x[0], + dynamicGroup0.x[1], + dynamicGroup0.x[2], + 1.0 / dynamicGroup0.invtm); fprintf(debug, "ffrad %8.3f %8.3f %8.3f\n", spatialData.ffrad[XX], @@ -638,17 +635,18 @@ void pull_calc_coms(const t_commrec* cr, } else { -#pragma omp parallel for num_threads(pull->nthreads) schedule(static) - for (int t = 0; t < pull->nthreads; t++) + const int numThreads = pgrp->numThreads(); +#pragma omp parallel for num_threads(numThreads) schedule(static) + for (int t = 0; t < numThreads; t++) { - int ind_start = (pgrp->atomSet.numAtomsLocal() * (t + 0)) / pull->nthreads; - int ind_end = (pgrp->atomSet.numAtomsLocal() * (t + 1)) / pull->nthreads; + int ind_start = (pgrp->atomSet.numAtomsLocal() * (t + 0)) / numThreads; + int ind_end = (pgrp->atomSet.numAtomsLocal() * (t + 1)) / numThreads; sum_com_part( pgrp, ind_start, ind_end, x, xp, masses, pbc, x_pbc, &pull->comSums[t]); } /* Reduce the thread contributions to sum_com[0] */ - for (int t = 1; t < pull->nthreads; t++) + for (int t = 1; t < numThreads; t++) { comSumsTotal.sum_wm += pull->comSums[t].sum_wm; comSumsTotal.sum_wwm += pull->comSums[t].sum_wwm; @@ -677,18 +675,19 @@ void pull_calc_coms(const t_commrec* cr, * This uses a slab of the system, thus we always have many * atoms in the pull groups. Therefore, always use threads. 
*/ -#pragma omp parallel for num_threads(pull->nthreads) schedule(static) - for (int t = 0; t < pull->nthreads; t++) + const int numThreads = pgrp->numThreads(); +#pragma omp parallel for num_threads(numThreads) schedule(static) + for (int t = 0; t < numThreads; t++) { - int ind_start = (pgrp->atomSet.numAtomsLocal() * (t + 0)) / pull->nthreads; - int ind_end = (pgrp->atomSet.numAtomsLocal() * (t + 1)) / pull->nthreads; + int ind_start = (pgrp->atomSet.numAtomsLocal() * (t + 0)) / numThreads; + int ind_end = (pgrp->atomSet.numAtomsLocal() * (t + 1)) / numThreads; sum_com_part_cosweight( pgrp, ind_start, ind_end, pull->cosdim, twopi_box, x, xp, masses, &pull->comSums[t]); } /* Reduce the thread contributions to comSums[0] */ ComSums& comSumsTotal = pull->comSums[0]; - for (int t = 1; t < pull->nthreads; t++) + for (int t = 1; t < numThreads; t++) { comSumsTotal.sum_cm += pull->comSums[t].sum_cm; comSumsTotal.sum_sm += pull->comSums[t].sum_sm; @@ -919,9 +918,9 @@ int pullCheckPbcWithinGroups(const pull_t& pull, gmx::ArrayRef /* Determine what dimensions are used for each group by pull coordinates */ std::vector dimUsed(pull.group.size(), { false, false, false }); - for (size_t c = 0; c < pull.coord.size(); c++) + for (const pull_coord_work_t& pcrd : pull.coord) { - const t_pull_coord& coordParams = pull.coord[c].params; + const t_pull_coord& coordParams = pcrd.params; for (int groupIndex = 0; groupIndex < coordParams.ngroup; groupIndex++) { for (int d = 0; d < DIM; d++) @@ -970,9 +969,9 @@ bool pullCheckPbcWithinGroup(const pull_t& pull, /* Determine what dimensions are used for each group by pull coordinates */ BoolVec dimUsed = { false, false, false }; - for (size_t c = 0; c < pull.coord.size(); c++) + for (const pull_coord_work_t& pcrd : pull.coord) { - const t_pull_coord& coordParams = pull.coord[c].params; + const t_pull_coord& coordParams = pcrd.params; for (int groupIndex = 0; groupIndex < coordParams.ngroup; groupIndex++) { if (coordParams.group[groupIndex] == groupNr) @@ -1090,16 +1089,17 @@ void initPullComFromPrevStep(const t_commrec* cr, } else { -#pragma omp parallel for num_threads(pull->nthreads) schedule(static) - for (int t = 0; t < pull->nthreads; t++) + const int numThreads = pgrp->numThreads(); +#pragma omp parallel for num_threads(numThreads) schedule(static) + for (int t = 0; t < numThreads; t++) { - int ind_start = (pgrp->atomSet.numAtomsLocal() * (t + 0)) / pull->nthreads; - int ind_end = (pgrp->atomSet.numAtomsLocal() * (t + 1)) / pull->nthreads; + int ind_start = (pgrp->atomSet.numAtomsLocal() * (t + 0)) / numThreads; + int ind_end = (pgrp->atomSet.numAtomsLocal() * (t + 1)) / numThreads; sum_com_part(pgrp, ind_start, ind_end, x, {}, masses, pbc, x_pbc, &pull->comSums[t]); } /* Reduce the thread contributions to sum_com[0] */ - for (int t = 1; t < pull->nthreads; t++) + for (int t = 1; t < numThreads; t++) { comSumsTotal.sum_wm += pull->comSums[t].sum_wm; comSumsTotal.sum_wwm += pull->comSums[t].sum_wwm; -- 2.22.0
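A minimal standalone sketch of the per-group threading rule described in the commit message, where a pull group only uses multiple threads once it has more than roughly 100 local atoms. The type, the helper names and the constant below are simplified stand-ins for the GROMACS internals (pull_group_work_t::numThreads() and c_pullMaxNumLocalAtomsSingleThreaded), not the actual implementation; the threshold value of 100 is taken from the commit message.

// Minimal sketch, not GROMACS code: simplified stand-ins for pull_group_work_t,
// its numThreads() helper and c_pullMaxNumLocalAtomsSingleThreaded (100 per the
// commit message).
#include <cstdio>
#include <vector>

static constexpr int c_maxNumLocalAtomsSingleThreaded = 100; // assumed threshold

struct PullGroupSketch
{
    std::vector<int> localAtomIndices; // local atoms that belong to this group
    int              maxNumThreads;    // fixed at init from the OpenMP settings

    // Per-group thread count: small groups always run single threaded.
    int numThreads() const
    {
        return static_cast<int>(localAtomIndices.size()) <= c_maxNumLocalAtomsSingleThreaded
                       ? 1
                       : maxNumThreads;
    }
};

// Mirrors the ind_start/ind_end chunking used in apply_forces_grp() and
// pull_calc_coms(): the loop trip count equals the group's own thread count.
static void forEachChunk(const PullGroupSketch& grp)
{
    const int numThreads = grp.numThreads();
    const int numAtoms   = static_cast<int>(grp.localAtomIndices.size());
#pragma omp parallel for num_threads(numThreads) schedule(static)
    for (int th = 0; th < numThreads; th++)
    {
        const int indStart = (numAtoms * (th + 0)) / numThreads;
        const int indEnd   = (numAtoms * (th + 1)) / numThreads;
        std::printf("thread %d handles local atoms [%d, %d)\n", th, indStart, indEnd);
    }
}

int main()
{
    forEachChunk(PullGroupSketch{ std::vector<int>(50), 8 });    // runs on 1 thread
    forEachChunk(PullGroupSketch{ std::vector<int>(10000), 8 }); // runs on up to 8 threads
}

Keeping the decision inside the group is what allows apply_forces_grp(), pull_calc_coms() and initPullComFromPrevStep() to drop their explicit nthreads parameter in this patch.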
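The ownership change in the first line of the commit message, where each cylinder coordinate owns its dynamic reference group instead of indexing a parallel pull->dyna vector, follows the shape sketched below. Again the type names are simplified stand-ins for pull_coord_work_t and pull_group_work_t, intended only to illustrate the pattern.

// Minimal sketch, not GROMACS code: each coordinate owns its optional dynamic
// reference group, replacing the parallel pull->dyna vector.
#include <memory>
#include <vector>

enum class GeometrySketch { Distance, Cylinder };

struct GroupSketch
{
    double x[3] = { 0.0, 0.0, 0.0 }; // center of mass of the dynamic group
};

struct CoordSketch
{
    GeometrySketch geometry = GeometrySketch::Distance;
    // Allocated only for cylinder geometry, i.e. only when a dynamic group exists.
    std::unique_ptr<GroupSketch> dynamicGroup0;
};

struct PullSketch
{
    std::vector<CoordSketch> coord;

    void initCylinderGroups()
    {
        for (CoordSketch& c : coord)
        {
            if (c.geometry == GeometrySketch::Cylinder)
            {
                c.dynamicGroup0 = std::make_unique<GroupSketch>();
            }
        }
    }
};

int main()
{
    PullSketch pull;
    pull.coord.resize(2);
    pull.coord[1].geometry = GeometrySketch::Cylinder;
    pull.initCylinderGroups(); // only coord[1] gets a dynamicGroup0
}

Because the dynamic group now lives in its coordinate, the cylinder buffer bookkeeping in make_cyl_refgrps() switches from indexing by coordinate number to a running bufferOffset, as seen in the pullutil.cpp hunks above.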