From 581ebdfd1759392b03cec1fa538e479fe495df5e Mon Sep 17 00:00:00 2001 From: Berk Hess Date: Mon, 11 Aug 2014 18:11:17 +0200 Subject: [PATCH] Reduced the cost of the pull communication With more than 32 ranks, a sub-communicator will be used for the pull communication. This reduces the pull communication significantly with small pull groups. With large pull groups the total simulation performance might not improve much, because ranks that are not in the sub-communicator will later wait for the pull ranks during the communication for the constraints. Added a pull_comm_t struct to separate the data used for communication. Change-Id: I92b64d098b508b11718ef3ae175b771032ad7be2 --- src/gromacs/domdec/domdec.cpp | 2 +- src/gromacs/pulling/pull.cpp | 181 +++++++++++++++++++++--- src/gromacs/pulling/pull.h | 4 +- src/gromacs/pulling/pull_internal.h | 25 +++- src/gromacs/pulling/pullutil.cpp | 206 ++++++++++++++++++++-------- 5 files changed, 333 insertions(+), 85 deletions(-) diff --git a/src/gromacs/domdec/domdec.cpp b/src/gromacs/domdec/domdec.cpp index c61e2893bc..2cb8e7a64d 100644 --- a/src/gromacs/domdec/domdec.cpp +++ b/src/gromacs/domdec/domdec.cpp @@ -9858,7 +9858,7 @@ void dd_partition_system(FILE *fplog, if (ir->bPull) { /* Update the local pull groups */ - dd_make_local_pull_groups(dd, ir->pull_work, mdatoms); + dd_make_local_pull_groups(cr, ir->pull_work, mdatoms); } if (ir->bRot) diff --git a/src/gromacs/pulling/pull.cpp b/src/gromacs/pulling/pull.cpp index 3c970da0e4..5a4f8e0150 100644 --- a/src/gromacs/pulling/pull.cpp +++ b/src/gromacs/pulling/pull.cpp @@ -38,6 +38,8 @@ #include "pull.h" +#include "config.h" + #include #include #include @@ -1158,21 +1160,24 @@ real pull_potential(struct pull_t *pull, t_mdatoms *md, t_pbc *pbc, t_commrec *cr, double t, real lambda, rvec *x, rvec *f, tensor vir, real *dvdlambda) { - real V, dVdl; + real V = 0, dVdl; assert(pull != NULL); - pull_calc_coms(cr, pull, md, pbc, t, x, NULL); + if (pull->comm.bParticipate) + { + pull_calc_coms(cr, pull, md, pbc, t, x, NULL); - do_pull_pot(pull, pbc, t, lambda, - &V, MASTER(cr) ? vir : NULL, &dVdl); + do_pull_pot(pull, pbc, t, lambda, + &V, MASTER(cr) ? vir : NULL, &dVdl); - /* Distribute forces over pulled groups */ - apply_forces(pull, md, f); + /* Distribute forces over the pull groups */ + apply_forces(pull, md, f); - if (MASTER(cr)) - { - *dvdlambda += dVdl; + if (MASTER(cr)) + { + *dvdlambda += dVdl; + } } return (MASTER(cr) ? 
V : 0.0); @@ -1184,9 +1189,12 @@ void pull_constraint(struct pull_t *pull, t_mdatoms *md, t_pbc *pbc, { assert(pull != NULL); - pull_calc_coms(cr, pull, md, pbc, t, x, xp); + if (pull->comm.bParticipate) + { + pull_calc_coms(cr, pull, md, pbc, t, x, xp); - do_constraint(pull, pbc, xp, v, MASTER(cr), vir, dt, t); + do_constraint(pull, pbc, xp, v, MASTER(cr), vir, dt, t); + } } static void make_local_pull_group(gmx_ga2la_t ga2la, @@ -1227,10 +1235,17 @@ static void make_local_pull_group(gmx_ga2la_t ga2la, } } -void dd_make_local_pull_groups(gmx_domdec_t *dd, struct pull_t *pull, t_mdatoms *md) +void dd_make_local_pull_groups(t_commrec *cr, struct pull_t *pull, t_mdatoms *md) { - gmx_ga2la_t ga2la; - int g; + gmx_domdec_t *dd; + pull_comm_t *comm; + gmx_ga2la_t ga2la; + gmx_bool bMustParticipate; + int g; + + dd = cr->dd; + + comm = &pull->comm; if (dd) { @@ -1241,10 +1256,103 @@ void dd_make_local_pull_groups(gmx_domdec_t *dd, struct pull_t *pull, t_mdatoms ga2la = NULL; } + /* We always make the master node participate, such that it can do i/o + * and to simplify MC type extensions people might have. + */ + bMustParticipate = (comm->bParticipateAll || dd == NULL || DDMASTER(dd)); + for (g = 0; g < pull->ngroup; g++) { + int a; + make_local_pull_group(ga2la, &pull->group[g], 0, md->homenr); + + /* We should participate if we have pull or pbc atoms */ + if (!bMustParticipate && + (pull->group[g].nat_loc > 0 || + (pull->group[g].params.pbcatom >= 0 && + ga2la_get_home(dd->ga2la, pull->group[g].params.pbcatom, &a)))) + { + bMustParticipate = TRUE; + } + } + + if (!comm->bParticipateAll) + { + /* Keep currently not required ranks in the communicator + * if they needed to participate up to 20 decompositions ago. + * This avoids frequent rebuilds due to atoms jumping back and forth. + */ + const gmx_int64_t history_count = 20; + gmx_bool bWillParticipate; + int count[2]; + + /* Increase the decomposition counter for the current call */ + comm->setup_count++; + + if (bMustParticipate) + { + comm->must_count = comm->setup_count; + } + + bWillParticipate = + bMustParticipate || + (comm->bParticipate && + comm->must_count >= comm->setup_count - history_count); + + if (debug && dd != NULL) + { + fprintf(debug, "Our DD rank (%3d) pull #atoms>0 or master: %d, will be part %d\n", + dd->rank, bMustParticipate, bWillParticipate); + } + + if (bWillParticipate) + { + /* Count the number of ranks that we want to have participating */ + count[0] = 1; + /* Count the number of ranks that need to be added */ + count[1] = comm->bParticipate ? 0 : 1; + } + else + { + count[0] = 0; + count[1] = 0; + } + + /* The cost of this global operation will be less than the cost + * of the extra MPI_Comm_split calls that we can avoid. + */ + gmx_sumi(2, count, cr); + + /* If we are missing ranks or if we have 20% more ranks than needed + * we make a new sub-communicator. + */ + if (count[1] > 0 || 6*count[0] < 5*comm->nparticipate) + { + if (debug) + { + fprintf(debug, "Creating new pull subcommunicator of size %d\n", + count[0]); + } + +#ifdef GMX_MPI + if (comm->mpi_comm_com != MPI_COMM_NULL) + { + MPI_Comm_free(&comm->mpi_comm_com); + } + + /* This might be an extremely expensive operation, so we try + * to avoid this splitting as much as possible. + */ + assert(dd != NULL); + MPI_Comm_split(dd->mpi_comm_all, bWillParticipate ? 
0 : 1, dd->rank, + &comm->mpi_comm_com); +#endif + + comm->bParticipate = bWillParticipate; + comm->nparticipate = count[0]; + } } /* Since the PBC of atoms might have changed, we need to update the PBC */ @@ -1437,6 +1545,7 @@ init_pull(FILE *fplog, const pull_params_t *pull_params, const t_inputrec *ir, gmx_bool bOutFile, unsigned long Flags) { struct pull_t *pull; + pull_comm_t *comm; int g, c, m; snew(pull, 1); @@ -1604,9 +1713,6 @@ init_pull(FILE *fplog, const pull_params_t *pull_params, const t_inputrec *ir, } } - pull->rbuf = NULL; - pull->dbuf = NULL; - pull->dbuf_cyl = NULL; pull->bRefAt = FALSE; pull->cosdim = -1; for (g = 0; g < pull->ngroup; g++) @@ -1724,6 +1830,35 @@ init_pull(FILE *fplog, const pull_params_t *pull_params, const t_inputrec *ir, } } + comm = &pull->comm; + +#ifdef GMX_MPI + /* Use a sub-communicator when we have more than 32 ranks */ + comm->bParticipateAll = (cr == NULL || !DOMAINDECOMP(cr) || + cr->dd->nnodes <= 32 || + getenv("GMX_PULL_PARTICIPATE_ALL") != NULL); + /* This sub-communicator is not used with comm->bParticipateAll, + * so we can always initialize it to NULL. + */ + comm->mpi_comm_com = MPI_COMM_NULL; + comm->nparticipate = 0; +#else + /* No MPI: 1 rank: all ranks pull */ + comm->bParticipateAll = TRUE; +#endif + comm->bParticipate = comm->bParticipateAll; + comm->setup_count = 0; + comm->must_count = 0; + + if (!comm->bParticipateAll && fplog != NULL) + { + fprintf(fplog, "Will use a sub-communicator for pull communication\n"); + } + + comm->rbuf = NULL; + comm->dbuf = NULL; + comm->dbuf_cyl = NULL; + /* We still need to initialize the PBC reference coordinates */ pull->bSetPBCatoms = TRUE; @@ -1783,9 +1918,15 @@ static void destroy_pull(struct pull_t *pull) sfree(pull->dyna); sfree(pull->coord); - sfree(pull->rbuf); - sfree(pull->dbuf); - sfree(pull->dbuf_cyl); +#ifdef GMX_MPI + if (pull->comm.mpi_comm_com != MPI_COMM_NULL) + { + MPI_Comm_free(&pull->comm.mpi_comm_com); + } +#endif + sfree(pull->comm.rbuf); + sfree(pull->comm.dbuf); + sfree(pull->comm.dbuf_cyl); sfree(pull); } diff --git a/src/gromacs/pulling/pull.h b/src/gromacs/pulling/pull.h index c73ca640dc..2d6898c5f9 100644 --- a/src/gromacs/pulling/pull.h +++ b/src/gromacs/pulling/pull.h @@ -122,11 +122,11 @@ void pull_constraint(struct pull_t *pull, t_mdatoms *md, struct t_pbc *pbc, /*! \brief Make a selection of the home atoms for all pull groups. * Should be called at every domain decomposition. * - * \param dd Structure containing domain decomposition data. + * \param cr Structure for communication info. * \param pull The pull group. * \param md All atoms. */ -void dd_make_local_pull_groups(gmx_domdec_t *dd, +void dd_make_local_pull_groups(t_commrec *cr, struct pull_t *pull, t_mdatoms *md); diff --git a/src/gromacs/pulling/pull_internal.h b/src/gromacs/pulling/pull_internal.h index ad5c10f0e0..75fbc2ac18 100644 --- a/src/gromacs/pulling/pull_internal.h +++ b/src/gromacs/pulling/pull_internal.h @@ -50,6 +50,8 @@ #ifndef GMX_PULLING_PULL_INTERNAL_H #define GMX_PULLING_PULL_INTERNAL_H +#include "config.h" + #include "gromacs/legacyheaders/typedefs.h" #ifdef __cplusplus @@ -98,6 +100,23 @@ typedef struct } pull_coord_work_t; +typedef struct { + gmx_bool bParticipateAll; /* Do all ranks always participate in pulling? */ + gmx_bool bParticipate; /* Does our rank participate in pulling? 
*/ +#ifdef GMX_MPI + MPI_Comm mpi_comm_com; /* Communicator for pulling */ +#endif + int nparticipate; /* The number of ranks participating */ + + gmx_int64_t setup_count; /* The number of decomposition calls */ + gmx_int64_t must_count; /* The last count at which our rank needed to participate */ + + rvec *rbuf; /* COM calculation buffer */ + dvec *dbuf; /* COM calculation buffer */ + double *dbuf_cyl; /* cylinder ref. groups calculation buffer */ +} +pull_comm_t; + struct pull_t { pull_params_t params; /* The pull parameters, from inputrec */ @@ -117,11 +136,11 @@ struct pull_t pull_coord_work_t *coord; /* The pull group param and work data */ gmx_bool bCylinder; /* Is group 0 a cylinder group? */ + gmx_bool bSetPBCatoms; /* Do we need to set x_pbc for the groups? */ - rvec *rbuf; /* COM calculation buffer */ - dvec *dbuf; /* COM calculation buffer */ - double *dbuf_cyl; /* cylinder ref. groups COM calculation buffer */ + pull_comm_t comm; /* Communication parameters, communicator and buffers */ + FILE *out_x; /* Output file for pull data */ FILE *out_f; /* Output file for pull data */ }; diff --git a/src/gromacs/pulling/pullutil.cpp b/src/gromacs/pulling/pullutil.cpp index 7345aad857..29b2ae8cd0 100644 --- a/src/gromacs/pulling/pullutil.cpp +++ b/src/gromacs/pulling/pullutil.cpp @@ -36,6 +36,8 @@ */ #include "gmxpre.h" +#include "config.h" + #include #include @@ -52,8 +54,89 @@ #include "gromacs/pulling/pull_internal.h" #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/futil.h" +#include "gromacs/utility/real.h" #include "gromacs/utility/smalloc.h" +static void pull_reduce_real(t_commrec *cr, + pull_comm_t *comm, + int n, + real *data) +{ + if (cr != NULL && PAR(cr)) + { + if (comm->bParticipateAll) + { + /* Sum the contributions over all DD ranks */ + gmx_sum(n, data, cr); + } + else + { +#ifdef GMX_MPI +#ifdef MPI_IN_PLACE_EXISTS + MPI_Allreduce(MPI_IN_PLACE, data, n, GMX_MPI_REAL, MPI_SUM, + comm->mpi_comm_com); +#else + real *buf; + int i; + + snew(buf, n); + + MPI_Allreduce(data, buf, n, GMX_MPI_REAL, MPI_SUM, + comm->mpi_comm_com); + + /* Copy the result from the buffer to the input/output data */ + for (i = 0; i < n; i++) + { + data[i] = buf[i]; + } + sfree(buf); +#endif +#else + gmx_incons("comm->bParticipateAll=FALSE without GMX_MPI"); +#endif + } + } +} + +static void pull_reduce_double(t_commrec *cr, + pull_comm_t *comm, + int n, + double *data) +{ + if (cr != NULL && PAR(cr)) + { + if (comm->bParticipateAll) + { + /* Sum the contributions over all DD ranks */ + gmx_sumd(n, data, cr); + } + else + { +#ifdef GMX_MPI +#ifdef MPI_IN_PLACE_EXISTS + MPI_Allreduce(MPI_IN_PLACE, data, n, MPI_DOUBLE, MPI_SUM, + comm->mpi_comm_com); +#else + double *buf; + int i; + + snew(buf, n); + + MPI_Allreduce(data, buf, n, MPI_DOUBLE, MPI_SUM, + comm->mpi_comm_com); + + /* Copy the result from the buffer to the input/output data */ + for (i = 0; i < n; i++) + { + data[i] = buf[i]; + } + sfree(buf); +#endif +#else + gmx_incons("comm->bParticipateAll=FALSE without GMX_MPI"); +#endif + } + } +} + static void pull_set_pbcatom(t_commrec *cr, pull_group_work_t *pgrp, rvec *x, rvec x_pbc) @@ -99,8 +182,11 @@ static void pull_set_pbcatoms(t_commrec *cr, struct pull_t *pull, if (cr && PAR(cr) && n > 0) { - /* Sum over the nodes to get x_pbc from the home node of pbcatom */ - gmx_sum(pull->ngroup*DIM, x_pbc[0], cr); + /* Sum over participating ranks to get x_pbc from the home ranks. + * This can be very expensive at high parallelization, so we only + * do this after each DD repartitioning. 
+ */ + pull_reduce_real(cr, &pull->comm, pull->ngroup*DIM, x_pbc[0]); } } @@ -112,11 +198,14 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md, int c, i, ii, m, start, end; rvec g_x, dx, dir; double inv_cyl_r2; + pull_comm_t *comm; gmx_ga2la_t ga2la = NULL; - if (pull->dbuf_cyl == NULL) + comm = &pull->comm; + + if (comm->dbuf_cyl == NULL) { - snew(pull->dbuf_cyl, pull->ncoord*stride); + snew(comm->dbuf_cyl, pull->ncoord*stride); } if (cr && DOMAINDECOMP(cr)) @@ -243,21 +332,21 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md, } } } - pull->dbuf_cyl[c*stride+0] = wmass; - pull->dbuf_cyl[c*stride+1] = wwmass; - pull->dbuf_cyl[c*stride+2] = sum_a; - pull->dbuf_cyl[c*stride+3] = radf_fac0[XX]; - pull->dbuf_cyl[c*stride+4] = radf_fac0[YY]; - pull->dbuf_cyl[c*stride+5] = radf_fac0[ZZ]; - pull->dbuf_cyl[c*stride+6] = radf_fac1[XX]; - pull->dbuf_cyl[c*stride+7] = radf_fac1[YY]; - pull->dbuf_cyl[c*stride+8] = radf_fac1[ZZ]; + comm->dbuf_cyl[c*stride+0] = wmass; + comm->dbuf_cyl[c*stride+1] = wwmass; + comm->dbuf_cyl[c*stride+2] = sum_a; + comm->dbuf_cyl[c*stride+3] = radf_fac0[XX]; + comm->dbuf_cyl[c*stride+4] = radf_fac0[YY]; + comm->dbuf_cyl[c*stride+5] = radf_fac0[ZZ]; + comm->dbuf_cyl[c*stride+6] = radf_fac1[XX]; + comm->dbuf_cyl[c*stride+7] = radf_fac1[YY]; + comm->dbuf_cyl[c*stride+8] = radf_fac1[ZZ]; } if (cr != NULL && PAR(cr)) { /* Sum the contributions over the ranks */ - gmx_sumd(pull->ncoord*stride, pull->dbuf_cyl, cr); + pull_reduce_double(cr, comm, pull->ncoord*stride, comm->dbuf_cyl); } for (c = 0; c < pull->ncoord; c++) @@ -274,8 +363,8 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md, pdyna = &pull->dyna[c]; pgrp = &pull->group[pcrd->params.group[1]]; - wmass = pull->dbuf_cyl[c*stride+0]; - wwmass = pull->dbuf_cyl[c*stride+1]; + wmass = comm->dbuf_cyl[c*stride+0]; + wwmass = comm->dbuf_cyl[c*stride+1]; pdyna->mwscale = 1.0/wmass; /* Cylinder pulling can't be used with constraints, but we set * wscale and invtm anyhow, in case someone would like to use them. 
@@ -291,7 +380,7 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md, for (m = 0; m < DIM; m++) { g_x[m] = pgrp->x[m] - pcrd->vec[m]*pcrd->value_ref; - dist = -pcrd->vec[m]*pull->dbuf_cyl[c*stride+2]*pdyna->mwscale; + dist = -pcrd->vec[m]*comm->dbuf_cyl[c*stride+2]*pdyna->mwscale; pdyna->x[m] = g_x[m] - dist; pcrd->cyl_dev += dist; } @@ -302,8 +391,8 @@ static void make_cyl_refgrps(t_commrec *cr, struct pull_t *pull, t_mdatoms *md, */ for (m = 0; m < DIM; m++) { - pcrd->ffrad[m] = (pull->dbuf_cyl[c*stride+6+m] + - pull->dbuf_cyl[c*stride+3+m]*pcrd->cyl_dev)/wmass; + pcrd->ffrad[m] = (comm->dbuf_cyl[c*stride+6+m] + + comm->dbuf_cyl[c*stride+3+m]*pcrd->cyl_dev)/wmass; } if (debug) @@ -335,21 +424,24 @@ void pull_calc_coms(t_commrec *cr, struct pull_t *pull, t_mdatoms *md, t_pbc *pbc, double t, rvec x[], rvec *xp) { - int g; - real twopi_box = 0; + int g; + real twopi_box = 0; + pull_comm_t *comm; + + comm = &pull->comm; - if (pull->rbuf == NULL) + if (comm->rbuf == NULL) { - snew(pull->rbuf, pull->ngroup); + snew(comm->rbuf, pull->ngroup); } - if (pull->dbuf == NULL) + if (comm->dbuf == NULL) { - snew(pull->dbuf, 3*pull->ngroup); + snew(comm->dbuf, 3*pull->ngroup); } if (pull->bRefAt && pull->bSetPBCatoms) { - pull_set_pbcatoms(cr, pull, x, pull->rbuf); + pull_set_pbcatoms(cr, pull, x, comm->rbuf); if (cr != NULL && DOMAINDECOMP(cr)) { @@ -403,7 +495,7 @@ void pull_calc_coms(t_commrec *cr, if (pgrp->epgrppbc == epgrppbcREFAT) { /* Set the pbc atom */ - copy_rvec(pull->rbuf[g], x_pbc); + copy_rvec(comm->rbuf[g], x_pbc); } for (i = 0; i < pgrp->nat_loc; i++) @@ -492,11 +584,11 @@ void pull_calc_coms(t_commrec *cr, } /* Copy local sums to a buffer for global summing */ - copy_dvec(com, pull->dbuf[g*3]); - copy_dvec(comp, pull->dbuf[g*3+1]); - pull->dbuf[g*3+2][0] = wmass; - pull->dbuf[g*3+2][1] = wwmass; - pull->dbuf[g*3+2][2] = 0; + copy_dvec(com, comm->dbuf[g*3]); + copy_dvec(comp, comm->dbuf[g*3 + 1]); + comm->dbuf[g*3 + 2][0] = wmass; + comm->dbuf[g*3 + 2][1] = wwmass; + comm->dbuf[g*3 + 2][2] = 0; } else { @@ -538,24 +630,20 @@ void pull_calc_coms(t_commrec *cr, } /* Copy local sums to a buffer for global summing */ - pull->dbuf[g*3 ][0] = cm; - pull->dbuf[g*3 ][1] = sm; - pull->dbuf[g*3 ][2] = 0; - pull->dbuf[g*3+1][0] = ccm; - pull->dbuf[g*3+1][1] = csm; - pull->dbuf[g*3+1][2] = ssm; - pull->dbuf[g*3+2][0] = cmp; - pull->dbuf[g*3+2][1] = smp; - pull->dbuf[g*3+2][2] = 0; + comm->dbuf[g*3 ][0] = cm; + comm->dbuf[g*3 ][1] = sm; + comm->dbuf[g*3 ][2] = 0; + comm->dbuf[g*3+1][0] = ccm; + comm->dbuf[g*3+1][1] = csm; + comm->dbuf[g*3+1][2] = ssm; + comm->dbuf[g*3+2][0] = cmp; + comm->dbuf[g*3+2][1] = smp; + comm->dbuf[g*3+2][2] = 0; } } } - if (cr && PAR(cr)) - { - /* Sum the contributions over the nodes */ - gmx_sumd(pull->ngroup*3*DIM, pull->dbuf[0], cr); - } + pull_reduce_double(cr, comm, pull->ngroup*3*DIM, comm->dbuf[0]); for (g = 0; g < pull->ngroup; g++) { @@ -570,8 +658,8 @@ void pull_calc_coms(t_commrec *cr, int m; /* Determine the inverse mass */ - wmass = pull->dbuf[g*3+2][0]; - wwmass = pull->dbuf[g*3+2][1]; + wmass = comm->dbuf[g*3+2][0]; + wwmass = comm->dbuf[g*3+2][1]; pgrp->mwscale = 1.0/wmass; /* invtm==0 signals a frozen group, so then we should keep it zero */ if (pgrp->invtm != 0) @@ -582,17 +670,17 @@ void pull_calc_coms(t_commrec *cr, /* Divide by the total mass */ for (m = 0; m < DIM; m++) { - pgrp->x[m] = pull->dbuf[g*3 ][m]*pgrp->mwscale; + pgrp->x[m] = comm->dbuf[g*3 ][m]*pgrp->mwscale; if (xp) { - pgrp->xp[m] = 
pull->dbuf[g*3+1][m]*pgrp->mwscale; + pgrp->xp[m] = comm->dbuf[g*3+1][m]*pgrp->mwscale; } if (pgrp->epgrppbc == epgrppbcREFAT) { - pgrp->x[m] += pull->rbuf[g][m]; + pgrp->x[m] += comm->rbuf[g][m]; if (xp) { - pgrp->xp[m] += pull->rbuf[g][m]; + pgrp->xp[m] += comm->rbuf[g][m]; } } } @@ -604,14 +692,14 @@ void pull_calc_coms(t_commrec *cr, int i, ii; /* Determine the optimal location of the cosine weight */ - csw = pull->dbuf[g*3][0]; - snw = pull->dbuf[g*3][1]; + csw = comm->dbuf[g*3][0]; + snw = comm->dbuf[g*3][1]; pgrp->x[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box; /* Set the weights for the local atoms */ wmass = sqrt(csw*csw + snw*snw); - wwmass = (pull->dbuf[g*3+1][0]*csw*csw + - pull->dbuf[g*3+1][1]*csw*snw + - pull->dbuf[g*3+1][2]*snw*snw)/(wmass*wmass); + wwmass = (comm->dbuf[g*3+1][0]*csw*csw + + comm->dbuf[g*3+1][1]*csw*snw + + comm->dbuf[g*3+1][2]*snw*snw)/(wmass*wmass); pgrp->mwscale = 1.0/wmass; pgrp->wscale = wmass/wwmass; @@ -627,8 +715,8 @@ void pull_calc_coms(t_commrec *cr, } if (xp) { - csw = pull->dbuf[g*3+2][0]; - snw = pull->dbuf[g*3+2][1]; + csw = comm->dbuf[g*3+2][0]; + snw = comm->dbuf[g*3+2][1]; pgrp->xp[pull->cosdim] = atan2_0_2pi(snw, csw)/twopi_box; } } -- 2.22.0
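The patch itself only touches GROMACS internals, but the underlying pattern is generic. Below is a minimal, self-contained sketch (C with MPI) of the same idea: after every repartitioning each rank reports whether it currently owns pull atoms, a hysteresis of 20 repartitionings keeps recently needed ranks in the group, and MPI_Comm_split rebuilds the sub-communicator on which the later MPI_Allreduce runs. This is an illustration, not GROMACS code; PullComm, rank_has_pull_atoms and update_pull_comm are made-up names, and unlike the patch it rebuilds the communicator on every call instead of first checking collectively whether the membership changed.

#include <mpi.h>

typedef struct {
    MPI_Comm comm;          /* current pull sub-communicator                */
    int      participates;  /* does this rank currently participate?       */
    long     setup_count;   /* number of repartitionings so far            */
    long     must_count;    /* last repartitioning at which we were needed */
} PullComm;

/* Toy stand-in for "does this rank own pull-group or pbc atoms right now?" */
static int rank_has_pull_atoms(int rank, int step)
{
    return (rank % 8 == 0) || (step % 2 == 0 && rank % 8 == 1);
}

/* Rebuild the sub-communicator after a (re)partitioning, with hysteresis.
 * Collective over MPI_COMM_WORLD: every rank must call this each time. */
static void update_pull_comm(PullComm *pc, int rank, int needed)
{
    const long history = 20;   /* keep ranks needed within the last 20 calls */
    int        participate;

    pc->setup_count++;
    if (needed)
    {
        pc->must_count = pc->setup_count;
    }
    /* Stay in the group if needed now, or needed recently while a member */
    participate = (needed ||
                   (pc->participates &&
                    pc->must_count >= pc->setup_count - history));

    /* The patch first sums two counters over all ranks and only splits when
     * the membership actually changed; this sketch splits unconditionally. */
    if (pc->comm != MPI_COMM_NULL)
    {
        MPI_Comm_free(&pc->comm);
    }
    MPI_Comm_split(MPI_COMM_WORLD, participate ? 0 : 1, rank, &pc->comm);
    pc->participates = participate;
}

int main(int argc, char **argv)
{
    PullComm pc = { MPI_COMM_NULL, 0, 0, 0 };
    int      rank, step;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    for (step = 0; step < 5; step++)
    {
        /* Pretend a domain repartitioning happened; membership may change */
        update_pull_comm(&pc, rank, rank_has_pull_atoms(rank, step));

        if (pc.participates)
        {
            double com[3] = { 1.0*rank, 0, 0 };

            /* Only the participating ranks pay for this reduction */
            MPI_Allreduce(MPI_IN_PLACE, com, 3, MPI_DOUBLE, MPI_SUM, pc.comm);
        }
    }

    if (pc.comm != MPI_COMM_NULL)
    {
        MPI_Comm_free(&pc.comm);
    }
    MPI_Finalize();
    return 0;
}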