Reduced the cost of the pull communication
author    Berk Hess <hess@kth.se>
          Mon, 11 Aug 2014 16:11:17 +0000 (18:11 +0200)
committer Gerrit Code Review <gerrit@gerrit.gromacs.org>
          Thu, 18 Jun 2015 14:08:40 +0000 (16:08 +0200)
With more than 32 ranks, a sub-communicator will be used
for the pull communication. This significantly reduces the cost of
the pull communication with small pull groups. With large pull groups
the total simulation performance might not improve much, because ranks
that are not in the sub-communicator will later have to wait for the
pull ranks during the communication for the constraints.

Added a pull_comm_t struct to separate the data used for communication.

Change-Id: I92b64d098b508b11718ef3ae175b771032ad7be2
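
The idea, as a minimal standalone MPI sketch (the participation test and
all names here are illustrative, not from this commit): ranks that own no
pull atoms split off into a separate communicator, so the reduction for
pulling only involves the participating ranks.

    #include <mpi.h>
    #include <stdio.h>

    int main(int argc, char **argv)
    {
        int      rank, nranks, participates;
        MPI_Comm sub;

        MPI_Init(&argc, &argv);
        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
        MPI_Comm_size(MPI_COMM_WORLD, &nranks);

        /* Hypothetical test: every fourth rank "owns" pull atoms;
         * in the real code this depends on the domain decomposition. */
        participates = (rank % 4 == 0);

        /* color 0 = pull ranks, color 1 = the rest; rank as key keeps order */
        MPI_Comm_split(MPI_COMM_WORLD, participates ? 0 : 1, rank, &sub);

        if (participates)
        {
            double local = 1.0, sum;

            /* This reduction involves only ~nranks/4 ranks instead of nranks */
            MPI_Allreduce(&local, &sum, 1, MPI_DOUBLE, MPI_SUM, sub);
            printf("rank %d: %g ranks participate in pulling\n", rank, sum);
        }

        MPI_Comm_free(&sub);
        MPI_Finalize();
        return 0;
    }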

src/gromacs/domdec/domdec.cpp
src/gromacs/pulling/pull.cpp
src/gromacs/pulling/pull.h
src/gromacs/pulling/pull_internal.h
src/gromacs/pulling/pullutil.cpp

index c61e2893bc7a38cb3852bd7423fe71c48b9c017b..2cb8e7a64d6122b6622367354a9565fe825f687b 100644 (file)
@@ -9858,7 +9858,7 @@ void dd_partition_system(FILE                *fplog,
     if (ir->bPull)
     {
         /* Update the local pull groups */
-        dd_make_local_pull_groups(dd, ir->pull_work, mdatoms);
+        dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
     }
 
     if (ir->bRot)
index 3c970da0e43bfa5edd4b00df9e7bf36376bbc3b3..5a4f8e0150ea849cd41d7326574c810a2fb14c88 100644 (file)
@@ -38,6 +38,8 @@
 
 #include "pull.h"
 
+#include "config.h"
+
 #include <assert.h>
 #include <math.h>
 #include <stdio.h>
@@ -1158,21 +1160,24 @@ real pull_potential(struct pull_t *pull, t_mdatoms *md, t_pbc *pbc,
                     t_commrec *cr, double t, real lambda,
                     rvec *x, rvec *f, tensor vir, real *dvdlambda)
 {
-    real V, dVdl;
+    real V = 0, dVdl;
 
     assert(pull != NULL);
 
-    pull_calc_coms(cr, pull, md, pbc, t, x, NULL);
+    if (pull->comm.bParticipate)
+    {
+        pull_calc_coms(cr, pull, md, pbc, t, x, NULL);
 
-    do_pull_pot(pull, pbc, t, lambda,
-                &V, MASTER(cr) ? vir : NULL, &dVdl);
+        do_pull_pot(pull, pbc, t, lambda,
+                    &V, MASTER(cr) ? vir : NULL, &dVdl);
 
-    /* Distribute forces over pulled groups */
-    apply_forces(pull, md, f);
+        /* Distribute forces over the pull groups */
+        apply_forces(pull, md, f);
 
-    if (MASTER(cr))
-    {
-        *dvdlambda += dVdl;
+        if (MASTER(cr))
+        {
+            *dvdlambda += dVdl;
+        }
     }
 
     return (MASTER(cr) ? V : 0.0);
@@ -1184,9 +1189,12 @@ void pull_constraint(struct pull_t *pull, t_mdatoms *md, t_pbc *pbc,
 {
     assert(pull != NULL);
 
-    pull_calc_coms(cr, pull, md, pbc, t, x, xp);
+    if (pull->comm.bParticipate)
+    {
+        pull_calc_coms(cr, pull, md, pbc, t, x, xp);
 
-    do_constraint(pull, pbc, xp, v, MASTER(cr), vir, dt, t);
+        do_constraint(pull, pbc, xp, v, MASTER(cr), vir, dt, t);
+    }
 }
 
 static void make_local_pull_group(gmx_ga2la_t ga2la,
@@ -1227,10 +1235,17 @@ static void make_local_pull_group(gmx_ga2la_t ga2la,
     }
 }
 
-void dd_make_local_pull_groups(gmx_domdec_t *dd, struct pull_t *pull, t_mdatoms *md)
+void dd_make_local_pull_groups(t_commrec *cr, struct pull_t *pull, t_mdatoms *md)
 {
-    gmx_ga2la_t ga2la;
-    int         g;
+    gmx_domdec_t *dd;
+    pull_comm_t  *comm;
+    gmx_ga2la_t   ga2la;
+    gmx_bool      bMustParticipate;
+    int           g;
+
+    dd = cr->dd;
+
+    comm = &pull->comm;
 
     if (dd)
     {
@@ -1241,10 +1256,103 @@ void dd_make_local_pull_groups(gmx_domdec_t *dd, struct pull_t *pull, t_mdatoms
         ga2la = NULL;
     }
 
+    /* We always make the master rank participate, so that it can do I/O
+     * and to simplify MC-type extensions people might have.
+     */
+    bMustParticipate = (comm->bParticipateAll || dd == NULL || DDMASTER(dd));
+
     for (g = 0; g < pull->ngroup; g++)
     {
+        int a;
+
         make_local_pull_group(ga2la, &pull->group[g],
                               0, md->homenr);
+
+        /* We should participate if we have pull or pbc atoms */
+        if (!bMustParticipate &&
+            (pull->group[g].nat_loc > 0 ||
+             (pull->group[g].params.pbcatom >= 0 &&
+              ga2la_get_home(dd->ga2la, pull->group[g].params.pbcatom, &a))))
+        {
+            bMustParticipate = TRUE;
+        }
+    }
+
+    if (!comm->bParticipateAll)
+    {
+        /* Keep ranks that are currently not required in the communicator
+         * if they were required up to 20 decompositions ago.
+         * This avoids frequent rebuilds when atoms jump back and forth
+         * between domains.
+         */
+        const gmx_int64_t history_count = 20;
+        gmx_bool          bWillParticipate;
+        int               count[2];
+
+        /* Increase the decomposition counter for the current call */
+        comm->setup_count++;
+
+        if (bMustParticipate)
+        {
+            comm->must_count = comm->setup_count;
+        }
+
+        bWillParticipate =
+            bMustParticipate ||
+            (comm->bParticipate &&
+             comm->must_count >= comm->setup_count - history_count);
+
+        if (debug && dd != NULL)
+        {
+            fprintf(debug, "Our DD rank (%3d) pull #atoms>0 or master: %d, will be part %d\n",
+                    dd->rank, bMustParticipate, bWillParticipate);
+        }
+
+        if (bWillParticipate)
+        {
+            /* Count the number of ranks that we want to have participating */
+            count[0] = 1;
+            /* Count the number of ranks that need to be added */
+            count[1] = comm->bParticipate ? 0 : 1;
+        }
+        else
+        {
+            count[0] = 0;
+            count[1] = 0;
+        }
+
+        /* The cost of this global operation will be less than the cost
+         * of the extra MPI_Comm_split calls that we can avoid.
+         */
+        gmx_sumi(2, count, cr);
+
+        /* If we are missing ranks, or if we have 20% more ranks than needed,
+         * we make a new sub-communicator.
+         */
+        if (count[1] > 0 || 6*count[0] < 5*comm->nparticipate)
+        {
+            if (debug)
+            {
+                fprintf(debug, "Creating new pull subcommunicator of size %d\n",
+                        count[0]);
+            }
+
+#ifdef GMX_MPI
+            if (comm->mpi_comm_com != MPI_COMM_NULL)
+            {
+                MPI_Comm_free(&comm->mpi_comm_com);
+            }
+
+            /* This might be an extremely expensive operation, so we try
+             * to avoid this splitting as much as possible.
+             */
+            assert(dd != NULL);
+            MPI_Comm_split(dd->mpi_comm_all, bWillParticipate ? 0 : 1, dd->rank,
+                           &comm->mpi_comm_com);
+#endif
+
+            comm->bParticipate = bWillParticipate;
+            comm->nparticipate = count[0];
+        }
     }
 
     /* Since the PBC of atoms might have changed, we need to update the PBC */
@@ -1437,6 +1545,7 @@ init_pull(FILE *fplog, const pull_params_t *pull_params, const t_inputrec *ir,
           gmx_bool bOutFile, unsigned long Flags)
 {
     struct pull_t *pull;
+    pull_comm_t   *comm;
     int            g, c, m;
 
     snew(pull, 1);
@@ -1604,9 +1713,6 @@ init_pull(FILE *fplog, const pull_params_t *pull_params, const t_inputrec *ir,
         }
     }
 
-    pull->rbuf     = NULL;
-    pull->dbuf     = NULL;
-    pull->dbuf_cyl = NULL;
     pull->bRefAt   = FALSE;
     pull->cosdim   = -1;
     for (g = 0; g < pull->ngroup; g++)
@@ -1724,6 +1830,35 @@ init_pull(FILE *fplog, const pull_params_t *pull_params, const t_inputrec *ir,
         }
     }
 
+    comm = &pull->comm;
+
+#ifdef GMX_MPI
+    /* Use a sub-communicator when we have more than 32 ranks */
+    comm->bParticipateAll = (cr == NULL || !DOMAINDECOMP(cr) ||
+                             cr->dd->nnodes <= 32 ||
+                             getenv("GMX_PULL_PARTICIPATE_ALL") != NULL);
+    /* This sub-communicator is not used with comm->bParticipateAll,
+     * so we can always initialize it to MPI_COMM_NULL.
+     */
+    comm->mpi_comm_com    = MPI_COMM_NULL;
+    comm->nparticipate    = 0;
+#else
+    /* Without MPI there is only one rank, so all ranks pull */
+    comm->bParticipateAll = TRUE;
+#endif
+    comm->bParticipate    = comm->bParticipateAll;
+    comm->setup_count     = 0;
+    comm->must_count      = 0;
+
+    if (!comm->bParticipateAll && fplog != NULL)
+    {
+        fprintf(fplog, "Will use a sub-communicator for pull communication\n");
+    }
+
+    comm->rbuf     = NULL;
+    comm->dbuf     = NULL;
+    comm->dbuf_cyl = NULL;
+
     /* We still need to initialize the PBC reference coordinates */
     pull->bSetPBCatoms = TRUE;
 
@@ -1783,9 +1918,15 @@ static void destroy_pull(struct pull_t *pull)
     sfree(pull->dyna);
     sfree(pull->coord);
 
-    sfree(pull->rbuf);
-    sfree(pull->dbuf);
-    sfree(pull->dbuf_cyl);
+#ifdef GMX_MPI
+    if (pull->comm.mpi_comm_com != MPI_COMM_NULL)
+    {
+        MPI_Comm_free(&pull->comm.mpi_comm_com);
+    }
+#endif
+    sfree(pull->comm.rbuf);
+    sfree(pull->comm.dbuf);
+    sfree(pull->comm.dbuf_cyl);
 
     sfree(pull);
 }
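
The rebuild heuristic in dd_make_local_pull_groups() above can be summarized
in a small sketch (hypothetical names, mirroring the logic of the change): a
rank stays in the sub-communicator for up to 20 decompositions after it was
last needed, and the communicator is only re-split when ranks must be added
or when more than 20% of its members have become superfluous.

    #include <stdbool.h>

    typedef struct {
        bool      bParticipate; /* Is our rank currently a member?         */
        long long setup_count;  /* Number of decomposition calls so far    */
        long long must_count;   /* Last count at which our rank was needed */
        int       nparticipate; /* Current size of the sub-communicator    */
    } comm_sketch_t;

    /* Returns true when the sub-communicator should be re-split.
     * bNeededNow: this rank has pull/pbc atoms (or is the master),
     * nWant:      global count of ranks that will participate,
     * nMissing:   global count of ranks that want to join but are not
     *             members (in the change both counts come from gmx_sumi()).
     */
    static bool need_new_subcomm(comm_sketch_t *c, bool bNeededNow,
                                 int nWant, int nMissing)
    {
        const long long history_count = 20;

        c->setup_count++;
        if (bNeededNow)
        {
            c->must_count = c->setup_count;
        }
        c->bParticipate = bNeededNow ||
            (c->bParticipate &&
             c->must_count >= c->setup_count - history_count);

        /* Re-split when ranks are missing, or when the communicator is
         * more than 20% too large: 6*nWant < 5*nparticipate. */
        return (nMissing > 0 || 6*nWant < 5*c->nparticipate);
    }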
index c73ca640dc7455ee6fb6536fcf2d9061e5d69a1d..2d6898c5f9d6fa79c1cef434e5539343a45c498b 100644 (file)
@@ -122,11 +122,11 @@ void pull_constraint(struct pull_t *pull, t_mdatoms *md, struct t_pbc *pbc,
 /*! \brief Make a selection of the home atoms for all pull groups.
  * Should be called at every domain decomposition.
  *
- * \param dd             Structure containing domain decomposition data.
+ * \param cr             Structure for communication info.
  * \param pull           The pull group.
  * \param md             All atoms.
  */
-void dd_make_local_pull_groups(gmx_domdec_t *dd,
+void dd_make_local_pull_groups(t_commrec *cr,
                                struct pull_t *pull, t_mdatoms *md);
 
 
index ad5c10f0e06185df5faac6b260a4f78d9baf04c7..75fbc2ac18e43f40529e1416becfe1c6134ee36e 100644 (file)
@@ -50,6 +50,8 @@
 #ifndef GMX_PULLING_PULL_INTERNAL_H
 #define GMX_PULLING_PULL_INTERNAL_H
 
+#include "config.h"
+
 #include "gromacs/legacyheaders/typedefs.h"
 
 #ifdef __cplusplus
@@ -98,6 +100,23 @@ typedef struct
 }
 pull_coord_work_t;
 
+typedef struct {
+    gmx_bool    bParticipateAll; /* Do all ranks always participate in pulling? */
+    gmx_bool    bParticipate;    /* Does our rank participate in pulling? */
+#ifdef GMX_MPI
+    MPI_Comm    mpi_comm_com;    /* Communicator for pulling */
+#endif
+    int         nparticipate;    /* The number of ranks participating */
+
+    gmx_int64_t setup_count;     /* The number of decomposition calls */
+    gmx_int64_t must_count;      /* The last setup count at which our rank was needed */
+
+    rvec       *rbuf;            /* COM calculation buffer */
+    dvec       *dbuf;            /* COM calculation buffer */
+    double     *dbuf_cyl;        /* cylinder ref. groups calculation buffer */
+}
+pull_comm_t;
+
 struct pull_t
 {
     pull_params_t      params;       /* The pull parameters, from inputrec */
@@ -117,11 +136,11 @@ struct pull_t
     pull_coord_work_t *coord;        /* The pull group param and work data */
 
     gmx_bool           bCylinder;    /* Is group 0 a cylinder group? */
+
     gmx_bool           bSetPBCatoms; /* Do we need to set x_pbc for the groups? */
 
-    rvec              *rbuf;         /* COM calculation buffer */
-    dvec              *dbuf;         /* COM calculation buffer */
-    double            *dbuf_cyl;     /* cylinder ref. groups COM calculation buffer */
+    pull_comm_t        comm;         /* Communication parameters, communicator and buffers */
+
     FILE              *out_x;        /* Output file for pull data */
     FILE              *out_f;        /* Output file for pull data */
 };
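
The pullutil.cpp changes below reduce over the sub-communicator using
MPI_IN_PLACE where the MPI library supports it: the send buffer argument is
replaced by MPI_IN_PLACE and the sum overwrites the input buffer directly.
A minimal sketch of the two variants (the MPI_IN_PLACE_EXISTS guard mirrors
the build-system define used below):

    #include <stdlib.h>

    #include <mpi.h>

    static void sum_doubles(double *data, int n, MPI_Comm comm)
    {
    #ifdef MPI_IN_PLACE_EXISTS
        /* Sum in place: no extra buffer needed */
        MPI_Allreduce(MPI_IN_PLACE, data, n, MPI_DOUBLE, MPI_SUM, comm);
    #else
        /* Fallback: reduce into a temporary buffer, then copy back */
        int     i;
        double *buf = (double *)malloc(n*sizeof(double));

        MPI_Allreduce(data, buf, n, MPI_DOUBLE, MPI_SUM, comm);
        for (i = 0; i < n; i++)
        {
            data[i] = buf[i];
        }
        free(buf);
    #endif
    }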
index 7345aad8570be6e214a9df46ebe952e8d655431b..29b2ae8cd01086dee756c54d34c22b7b49670623 100644 (file)
@@ -36,6 +36,8 @@
  */
 #include "gmxpre.h"
 
+#include "config.h"
+
 #include <assert.h>
 #include <stdlib.h>
 
 #include "gromacs/pulling/pull_internal.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/futil.h"
+#include "gromacs/utility/real.h"
 #include "gromacs/utility/smalloc.h"
 
+static void pull_reduce_real(t_commrec   *cr,
+                             pull_comm_t *comm,
+                             int          n,
+                             real        *data)
+{
+    if (cr != NULL && PAR(cr))
+    {
+        if (comm->bParticipateAll)
+        {
+            /* Sum the contributions over all DD ranks */
+            gmx_sum(n, data, cr);
+        }
+        else
+        {
+#ifdef GMX_MPI
+#ifdef MPI_IN_PLACE_EXISTS
+            MPI_Allreduce(MPI_IN_PLACE, data, n, GMX_MPI_REAL, MPI_SUM,
+                          comm->mpi_comm_com);
+#else
+            int   i;
+            real *buf;
+
+            snew(buf, n);
+
+            MPI_Allreduce(data, buf, n, GMX_MPI_REAL, MPI_SUM,
+                          comm->mpi_comm_com);
+
+            /* Copy the result from the buffer to the input/output data */
+            for (i = 0; i < nr; i++)
+            {
+                data[i] = buf[i];
+            }
+            sfree(buf);
+#endif
+#else
+            gmx_incons("comm->bParticipateAll=FALSE without GMX_MPI");
+#endif
+        }
+    }
+}
+
+static void pull_reduce_double(t_commrec   *cr,
+                               pull_comm_t *comm,
+                               int          n,
+                               double      *data)
+{
+    if (cr != NULL && PAR(cr))
+    {
+        if (comm->bParticipateAll)
+        {
+            /* Sum the contributions over all DD ranks */
+            gmx_sumd(n, data, cr);
+        }
+        else
+        {
+#ifdef GMX_MPI
+#ifdef MPI_IN_PLACE_EXISTS
+            MPI_Allreduce(MPI_IN_PLACE, data, n, MPI_DOUBLE, MPI_SUM,
+                          comm->mpi_comm_com);
+#else
+            int     i;
+            double *buf;
+
+            snew(buf, n);
+
+            MPI_Allreduce(data, buf, n, MPI_DOUBLE, MPI_SUM,
+                          comm->mpi_comm_com);
+
+            /* Copy the result from the buffer to the input/output data */
+            for (i = 0; i < nr; i++)
+            {
+                data[i] = buf[i];
+            }
+            sfree(buf);
+#endif
+#else
+            gmx_incons("comm->bParticipateAll=FALSE without GMX_MPI");
+#endif
+        }
+    }
+}
+
 static void pull_set_pbcatom(t_commrec *cr, pull_group_work_t *pgrp,
                              rvec *x,
                              rvec x_pbc)
@@ -99,8 +182,11 @@ static void pull_set_pbcatoms(t_commrec *cr, struct pull_t *pull,
 
     if (cr && PAR(cr) && n > 0)
     {
-        /* Sum over the nodes to get x_pbc from the home node of pbcatom */
-        gmx_sum(pull->ngroup*DIM, x_pbc[0], cr);
+        /* Sum over participating ranks to get x_pbc from the home ranks.
+         * This can be very expensive at high parallelization, so we only
+         * do this after each DD repartitioning.
+         */
+        pull_reduce_real(cr, &pull->comm, pull->ngroup*DIM, x_pbc[0]);
     }
 }
 
@@ -112,11 +198,14 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md,
     int           c, i, ii, m, start, end;
     rvec          g_x, dx, dir;
     double        inv_cyl_r2;
+    pull_comm_t  *comm;
     gmx_ga2la_t   ga2la = NULL;
 
-    if (pull->dbuf_cyl == NULL)
+    comm = &pull->comm;
+
+    if (comm->dbuf_cyl == NULL)
     {
-        snew(pull->dbuf_cyl, pull->ncoord*stride);
+        snew(comm->dbuf_cyl, pull->ncoord*stride);
     }
 
     if (cr && DOMAINDECOMP(cr))
@@ -243,21 +332,21 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md,
                 }
             }
         }
-        pull->dbuf_cyl[c*stride+0] = wmass;
-        pull->dbuf_cyl[c*stride+1] = wwmass;
-        pull->dbuf_cyl[c*stride+2] = sum_a;
-        pull->dbuf_cyl[c*stride+3] = radf_fac0[XX];
-        pull->dbuf_cyl[c*stride+4] = radf_fac0[YY];
-        pull->dbuf_cyl[c*stride+5] = radf_fac0[ZZ];
-        pull->dbuf_cyl[c*stride+6] = radf_fac1[XX];
-        pull->dbuf_cyl[c*stride+7] = radf_fac1[YY];
-        pull->dbuf_cyl[c*stride+8] = radf_fac1[ZZ];
+        comm->dbuf_cyl[c*stride+0] = wmass;
+        comm->dbuf_cyl[c*stride+1] = wwmass;
+        comm->dbuf_cyl[c*stride+2] = sum_a;
+        comm->dbuf_cyl[c*stride+3] = radf_fac0[XX];
+        comm->dbuf_cyl[c*stride+4] = radf_fac0[YY];
+        comm->dbuf_cyl[c*stride+5] = radf_fac0[ZZ];
+        comm->dbuf_cyl[c*stride+6] = radf_fac1[XX];
+        comm->dbuf_cyl[c*stride+7] = radf_fac1[YY];
+        comm->dbuf_cyl[c*stride+8] = radf_fac1[ZZ];
     }
 
     if (cr != NULL && PAR(cr))
     {
         /* Sum the contributions over the ranks */
-        gmx_sumd(pull->ncoord*stride, pull->dbuf_cyl, cr);
+        pull_reduce_double(cr, comm, pull->ncoord*stride, comm->dbuf_cyl);
     }
 
     for (c = 0; c < pull->ncoord; c++)
@@ -274,8 +363,8 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md,
             pdyna = &pull->dyna[c];
             pgrp  = &pull->group[pcrd->params.group[1]];
 
-            wmass          = pull->dbuf_cyl[c*stride+0];
-            wwmass         = pull->dbuf_cyl[c*stride+1];
+            wmass          = comm->dbuf_cyl[c*stride+0];
+            wwmass         = comm->dbuf_cyl[c*stride+1];
             pdyna->mwscale = 1.0/wmass;
             /* Cylinder pulling can't be used with constraints, but we set
              * wscale and invtm anyhow, in case someone would like to use them.
@@ -291,7 +380,7 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md,
             for (m = 0; m < DIM; m++)
             {
                 g_x[m]         = pgrp->x[m] - pcrd->vec[m]*pcrd->value_ref;
-                dist           = -pcrd->vec[m]*pull->dbuf_cyl[c*stride+2]*pdyna->mwscale;
+                dist           = -pcrd->vec[m]*comm->dbuf_cyl[c*stride+2]*pdyna->mwscale;
                 pdyna->x[m]    = g_x[m] - dist;
                 pcrd->cyl_dev += dist;
             }
@@ -302,8 +391,8 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md,
              */
             for (m = 0; m < DIM; m++)
             {
-                pcrd->ffrad[m] = (pull->dbuf_cyl[c*stride+6+m] +
-                                  pull->dbuf_cyl[c*stride+3+m]*pcrd->cyl_dev)/wmass;
+                pcrd->ffrad[m] = (comm->dbuf_cyl[c*stride+6+m] +
+                                  comm->dbuf_cyl[c*stride+3+m]*pcrd->cyl_dev)/wmass;
             }
 
             if (debug)
@@ -335,21 +424,24 @@ void pull_calc_coms(t_commrec *cr,
                     struct pull_t *pull, t_mdatoms *md, t_pbc *pbc, double t,
                     rvec x[], rvec *xp)
 {
-    int  g;
-    real twopi_box = 0;
+    int          g;
+    real         twopi_box = 0;
+    pull_comm_t *comm;
+
+    comm = &pull->comm;
 
-    if (pull->rbuf == NULL)
+    if (comm->rbuf == NULL)
     {
-        snew(pull->rbuf, pull->ngroup);
+        snew(comm->rbuf, pull->ngroup);
     }
-    if (pull->dbuf == NULL)
+    if (comm->dbuf == NULL)
     {
-        snew(pull->dbuf, 3*pull->ngroup);
+        snew(comm->dbuf, 3*pull->ngroup);
     }
 
     if (pull->bRefAt && pull->bSetPBCatoms)
     {
-        pull_set_pbcatoms(cr, pull, x, pull->rbuf);
+        pull_set_pbcatoms(cr, pull, x, comm->rbuf);
 
         if (cr != NULL && DOMAINDECOMP(cr))
         {
@@ -403,7 +495,7 @@ void pull_calc_coms(t_commrec *cr,
                 if (pgrp->epgrppbc == epgrppbcREFAT)
                 {
                     /* Set the pbc atom */
-                    copy_rvec(pull->rbuf[g], x_pbc);
+                    copy_rvec(comm->rbuf[g], x_pbc);
                 }
 
                 for (i = 0; i < pgrp->nat_loc; i++)
@@ -492,11 +584,11 @@ void pull_calc_coms(t_commrec *cr,
                 }
 
                 /* Copy local sums to a buffer for global summing */
-                copy_dvec(com, pull->dbuf[g*3]);
-                copy_dvec(comp, pull->dbuf[g*3+1]);
-                pull->dbuf[g*3+2][0] = wmass;
-                pull->dbuf[g*3+2][1] = wwmass;
-                pull->dbuf[g*3+2][2] = 0;
+                copy_dvec(com,  comm->dbuf[g*3]);
+                copy_dvec(comp, comm->dbuf[g*3 + 1]);
+                comm->dbuf[g*3 + 2][0] = wmass;
+                comm->dbuf[g*3 + 2][1] = wwmass;
+                comm->dbuf[g*3 + 2][2] = 0;
             }
             else
             {
@@ -538,24 +630,20 @@ void pull_calc_coms(t_commrec *cr,
                 }
 
                 /* Copy local sums to a buffer for global summing */
-                pull->dbuf[g*3  ][0] = cm;
-                pull->dbuf[g*3  ][1] = sm;
-                pull->dbuf[g*3  ][2] = 0;
-                pull->dbuf[g*3+1][0] = ccm;
-                pull->dbuf[g*3+1][1] = csm;
-                pull->dbuf[g*3+1][2] = ssm;
-                pull->dbuf[g*3+2][0] = cmp;
-                pull->dbuf[g*3+2][1] = smp;
-                pull->dbuf[g*3+2][2] = 0;
+                comm->dbuf[g*3  ][0] = cm;
+                comm->dbuf[g*3  ][1] = sm;
+                comm->dbuf[g*3  ][2] = 0;
+                comm->dbuf[g*3+1][0] = ccm;
+                comm->dbuf[g*3+1][1] = csm;
+                comm->dbuf[g*3+1][2] = ssm;
+                comm->dbuf[g*3+2][0] = cmp;
+                comm->dbuf[g*3+2][1] = smp;
+                comm->dbuf[g*3+2][2] = 0;
             }
         }
     }
 
-    if (cr && PAR(cr))
-    {
-        /* Sum the contributions over the nodes */
-        gmx_sumd(pull->ngroup*3*DIM, pull->dbuf[0], cr);
-    }
+    pull_reduce_double(cr, comm, pull->ngroup*3*DIM, comm->dbuf[0]);
 
     for (g = 0; g < pull->ngroup; g++)
     {
@@ -570,8 +658,8 @@ void pull_calc_coms(t_commrec *cr,
                 int    m;
 
                 /* Determine the inverse mass */
-                wmass             = pull->dbuf[g*3+2][0];
-                wwmass            = pull->dbuf[g*3+2][1];
+                wmass             = comm->dbuf[g*3+2][0];
+                wwmass            = comm->dbuf[g*3+2][1];
                 pgrp->mwscale     = 1.0/wmass;
                 /* invtm==0 signals a frozen group, so then we should keep it zero */
                 if (pgrp->invtm != 0)
@@ -582,17 +670,17 @@ void pull_calc_coms(t_commrec *cr,
                 /* Divide by the total mass */
                 for (m = 0; m < DIM; m++)
                 {
-                    pgrp->x[m]    = pull->dbuf[g*3  ][m]*pgrp->mwscale;
+                    pgrp->x[m]      = comm->dbuf[g*3  ][m]*pgrp->mwscale;
                     if (xp)
                     {
-                        pgrp->xp[m] = pull->dbuf[g*3+1][m]*pgrp->mwscale;
+                        pgrp->xp[m] = comm->dbuf[g*3+1][m]*pgrp->mwscale;
                     }
                     if (pgrp->epgrppbc == epgrppbcREFAT)
                     {
-                        pgrp->x[m]    += pull->rbuf[g][m];
+                        pgrp->x[m]      += comm->rbuf[g][m];
                         if (xp)
                         {
-                            pgrp->xp[m] += pull->rbuf[g][m];
+                            pgrp->xp[m] += comm->rbuf[g][m];
                         }
                     }
                 }
@@ -604,14 +692,14 @@ void pull_calc_coms(t_commrec *cr,
                 int    i, ii;
 
                 /* Determine the optimal location of the cosine weight */
-                csw                   = pull->dbuf[g*3][0];
-                snw                   = pull->dbuf[g*3][1];
+                csw                   = comm->dbuf[g*3][0];
+                snw                   = comm->dbuf[g*3][1];
                 pgrp->x[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box;
                 /* Set the weights for the local atoms */
                 wmass  = sqrt(csw*csw + snw*snw);
-                wwmass = (pull->dbuf[g*3+1][0]*csw*csw +
-                          pull->dbuf[g*3+1][1]*csw*snw +
-                          pull->dbuf[g*3+1][2]*snw*snw)/(wmass*wmass);
+                wwmass = (comm->dbuf[g*3+1][0]*csw*csw +
+                          comm->dbuf[g*3+1][1]*csw*snw +
+                          comm->dbuf[g*3+1][2]*snw*snw)/(wmass*wmass);
 
                 pgrp->mwscale = 1.0/wmass;
                 pgrp->wscale  = wmass/wwmass;
@@ -627,8 +715,8 @@ void pull_calc_coms(t_commrec *cr,
                 }
                 if (xp)
                 {
-                    csw                    = pull->dbuf[g*3+2][0];
-                    snw                    = pull->dbuf[g*3+2][1];
+                    csw                    = comm->dbuf[g*3+2][0];
+                    snw                    = comm->dbuf[g*3+2][1];
                     pgrp->xp[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box;
                 }
             }
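
For reference, the dbuf layout reduced in pull_calc_coms() above packs three
dvec entries per pull group, so a single reduction over ngroup*3*DIM doubles
covers the COM sums and the mass weights. A sketch of the packing for the
non-cosine case (illustrative helper, not part of the commit):

    typedef double dvec[3];

    /* For group g: dbuf[g*3] holds the weighted coordinate sum for x,
     * dbuf[g*3+1] the sum for xp, and dbuf[g*3+2] the scalar weights
     * (wmass, wwmass, 0). */
    static void pack_group_sums(dvec *dbuf, int g,
                                const dvec com, const dvec comp,
                                double wmass, double wwmass)
    {
        int d;

        for (d = 0; d < 3; d++)
        {
            dbuf[g*3    ][d] = com[d];
            dbuf[g*3 + 1][d] = comp[d];
        }
        dbuf[g*3 + 2][0] = wmass;
        dbuf[g*3 + 2][1] = wwmass;
        dbuf[g*3 + 2][2] = 0;
    }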