Merge branch release-2018
[alexxy/gromacs.git] / src / gromacs / domdec / domdec.cpp
index 913c2488de8e6237221461b2fd2bc442ade3d62c..4567f43db8d33e188387d353441862f8664cc355 100644 (file)
 
 #include "config.h"
 
-#include <assert.h>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
+#include <cassert>
+#include <climits>
 #include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
 
 #include <algorithm>
 
+#include "gromacs/compat/make_unique.h"
+#include "gromacs/domdec/collect.h"
+#include "gromacs/domdec/dlbtiming.h"
 #include "gromacs/domdec/domdec_network.h"
 #include "gromacs/domdec/ga2la.h"
+#include "gromacs/domdec/localatomsetmanager.h"
 #include "gromacs/ewald/pme.h"
 #include "gromacs/fileio/gmxfio.h"
 #include "gromacs/fileio/pdbio.h"
 #include "gromacs/math/vectypes.h"
 #include "gromacs/mdlib/calc_verletbuf.h"
 #include "gromacs/mdlib/constr.h"
-#include "gromacs/mdlib/force.h"
+#include "gromacs/mdlib/constraintrange.h"
 #include "gromacs/mdlib/forcerec.h"
-#include "gromacs/mdlib/genborn.h"
 #include "gromacs/mdlib/gmx_omp_nthreads.h"
+#include "gromacs/mdlib/lincs.h"
 #include "gromacs/mdlib/mdatoms.h"
 #include "gromacs/mdlib/mdrun.h"
 #include "gromacs/mdlib/mdsetup.h"
@@ -88,8 +91,6 @@
 #include "gromacs/pbcutil/ishift.h"
 #include "gromacs/pbcutil/pbc.h"
 #include "gromacs/pulling/pull.h"
-#include "gromacs/pulling/pull_rotation.h"
-#include "gromacs/swap/swapcoords.h"
 #include "gromacs/timing/wallcycle.h"
 #include "gromacs/topology/block.h"
 #include "gromacs/topology/idef.h"
 #include "gromacs/utility/exceptions.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxmpi.h"
-#include "gromacs/utility/qsort_threadsafe.h"
 #include "gromacs/utility/real.h"
 #include "gromacs/utility/smalloc.h"
+#include "gromacs/utility/strconvert.h"
 #include "gromacs/utility/stringutil.h"
 
+#include "atomdistribution.h"
+#include "cellsizes.h"
+#include "distribute.h"
 #include "domdec_constraints.h"
 #include "domdec_internal.h"
 #include "domdec_vsite.h"
-
-#define DDRANK(dd, rank)    (rank)
-#define DDMASTERRANK(dd)   (dd->masterrank)
-
-struct gmx_domdec_master_t
-{
-    /* The cell boundaries */
-    real **cell_x;
-    /* The global charge group division */
-    int   *ncg;    /* Number of home charge groups for each node */
-    int   *index;  /* Index of nnodes+1 into cg */
-    int   *cg;     /* Global charge group index */
-    int   *nat;    /* Number of home atoms for each node. */
-    int   *ibuf;   /* Buffer for communication */
-    rvec  *vbuf;   /* Buffer for state scattering and gathering */
-};
+#include "redistribute.h"
+#include "utility.h"
 
 #define DD_NLOAD_MAX 9
 
-const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on", "on" };
+static const char *edlbs_names[edlbsNR] = { "off", "auto", "locked", "on", "on" };
 
 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 #define DD_CGIBS 2
@@ -157,12 +147,6 @@ static const int
                                                  {2, 5, 6},
                                                  {3, 5, 7}};
 
-/* Factors used to avoid problems due to rounding issues */
-#define DD_CELL_MARGIN       1.0001
-#define DD_CELL_MARGIN2      1.00005
-/* Factor to account for pressure scaling during nstlist steps */
-#define DD_PRES_SCALE_MARGIN 1.02
-
 /* Turn on DLB when the load imbalance causes this amount of total loss.
  * There is a bit of overhead with DLB and it's difficult to achieve
  * a load imbalance of less than 2% with DLB.
@@ -172,15 +156,6 @@ static const int
 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
 #define DD_PERF_LOSS_WARN    0.05
 
-#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
-
-/* Use separate MPI send and receive commands
- * when nnodes <= GMX_DD_NNODES_SENDRECV.
- * This saves memory (and some copying for small nnodes).
- * For high parallelization scatter and gather calls are used.
- */
-#define GMX_DD_NNODES_SENDRECV 4
-
 
 /* We check whether to turn on DLB at the first and then every 100 DD partitionings.
  * With large imbalance DLB will turn on at the first step, so we can
@@ -210,12 +185,7 @@ static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx
    }
  */
 
-/* This order is required to minimize the coordinate communication in PME
- * which uses decomposition in the x direction.
- */
-#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
-
-static void ddindex2xyz(ivec nc, int ind, ivec xyz)
+static void ddindex2xyz(const ivec nc, int ind, ivec xyz)
 {
     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
@@ -261,11 +231,11 @@ int ddglatnr(const gmx_domdec_t *dd, int i)
     }
     else
     {
-        if (i >= dd->comm->nat[ddnatNR-1])
+        if (i >= dd->comm->atomRanges.numAtomsTotal())
         {
-            gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
+            gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->atomRanges.numAtomsTotal());
         }
-        atnr = dd->gatindex[i] + 1;
+        atnr = dd->globalAtomIndices[i] + 1;
     }
 
     return atnr;
@@ -276,35 +246,6 @@ t_block *dd_charge_groups_global(gmx_domdec_t *dd)
     return &dd->comm->cgs_gl;
 }
 
-/*! \brief Returns true if the DLB state indicates that the balancer is on. */
-static bool isDlbOn(const gmx_domdec_comm_t *comm)
-{
-    return (comm->dlbState == edlbsOnCanTurnOff ||
-            comm->dlbState == edlbsOnUser);
-}
-/*! \brief Returns true if the DLB state indicates that the balancer is off/disabled.
- */
-static bool isDlbDisabled(const gmx_domdec_comm_t *comm)
-{
-    return (comm->dlbState == edlbsOffUser ||
-            comm->dlbState == edlbsOffForever);
-}
-
-static void vec_rvec_init(vec_rvec_t *v)
-{
-    v->nalloc = 0;
-    v->v      = nullptr;
-}
-
-static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
-{
-    if (n > v->nalloc)
-    {
-        v->nalloc = over_alloc_dd(n);
-        srenew(v->v, v->nalloc);
-    }
-}
-
 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 {
     int i;
@@ -317,7 +258,7 @@ void dd_store_state(gmx_domdec_t *dd, t_state *state)
     state->cg_gl.resize(dd->ncg_home);
     for (i = 0; i < dd->ncg_home; i++)
     {
-        state->cg_gl[i] = dd->index_gl[i];
+        state->cg_gl[i] = dd->globalAtomGroupIndices[i];
     }
 
     state->ddp_count_cg_gl = dd->ddp_count;
@@ -372,45 +313,51 @@ void dd_get_ns_ranges(const gmx_domdec_t *dd, int icg,
     }
 }
 
+int dd_numHomeAtoms(const gmx_domdec_t &dd)
+{
+    return dd.comm->atomRanges.numHomeAtoms();
+}
+
 int dd_natoms_mdatoms(const gmx_domdec_t *dd)
 {
     /* We currently set mdatoms entries for all atoms:
      * local + non-local + communicated for vsite + constraints
      */
 
-    return dd->comm->nat[ddnatNR - 1];
+    return dd->comm->atomRanges.numAtomsTotal();
 }
 
 int dd_natoms_vsite(const gmx_domdec_t *dd)
 {
-    return dd->comm->nat[ddnatVSITE];
+    return dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
 }
 
 void dd_get_constraint_range(const gmx_domdec_t *dd, int *at_start, int *at_end)
 {
-    *at_start = dd->comm->nat[ddnatCON-1];
-    *at_end   = dd->comm->nat[ddnatCON];
+    *at_start = dd->comm->atomRanges.start(DDAtomRanges::Type::Constraints);
+    *at_end   = dd->comm->atomRanges.end(DDAtomRanges::Type::Constraints);
 }
 
-void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
+void dd_move_x(gmx_domdec_t             *dd,
+               matrix                    box,
+               gmx::ArrayRef<gmx::RVec>  x,
+               gmx_wallcycle            *wcycle)
 {
-    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
-    int                   *index, *cgindex;
+    wallcycle_start(wcycle, ewcMOVEX);
+
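+    /* Communicate the home-zone coordinates to the halo zones of the
+     * neighboring domains, one pulse at a time along each decomposed
+     * dimension, applying the periodic shift (or screw transformation)
+     * where the domain borders the box edge.
+     */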
+    int                    nzone, nat_tot;
     gmx_domdec_comm_t     *comm;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t      *ind;
-    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
+    rvec                   shift = {0, 0, 0};
     gmx_bool               bPBC, bScrew;
 
     comm = dd->comm;
 
-    cgindex = dd->cgindex;
-
-    buf = comm->vbuf.v;
+    const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 
     nzone   = 1;
-    nat_tot = dd->nat_home;
-    for (d = 0; d < dd->ndim; d++)
+    nat_tot = comm->atomRanges.numHomeAtoms();
+    for (int d = 0; d < dd->ndim; d++)
     {
         bPBC   = (dd->ci[dd->dim[d]] == 0);
         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
@@ -419,110 +366,111 @@ void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
             copy_rvec(box[dd->dim[d]], shift);
         }
         cd = &comm->cd[d];
-        for (p = 0; p < cd->np; p++)
+        for (const gmx_domdec_ind_t &ind : cd->ind)
         {
-            ind   = &cd->ind[p];
-            index = ind->index;
-            n     = 0;
+            DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
+            gmx::ArrayRef<gmx::RVec>  &sendBuffer = sendBufferAccess.buffer;
+            int                        n          = 0;
             if (!bPBC)
             {
-                for (i = 0; i < ind->nsend[nzone]; i++)
+                for (int g : ind.index)
                 {
-                    at0 = cgindex[index[i]];
-                    at1 = cgindex[index[i]+1];
-                    for (j = at0; j < at1; j++)
+                    for (int j : atomGrouping.block(g))
                     {
-                        copy_rvec(x[j], buf[n]);
+                        sendBuffer[n] = x[j];
                         n++;
                     }
                 }
             }
             else if (!bScrew)
             {
-                for (i = 0; i < ind->nsend[nzone]; i++)
+                for (int g : ind.index)
                 {
-                    at0 = cgindex[index[i]];
-                    at1 = cgindex[index[i]+1];
-                    for (j = at0; j < at1; j++)
+                    for (int j : atomGrouping.block(g))
                     {
                         /* We need to shift the coordinates */
-                        rvec_add(x[j], shift, buf[n]);
+                        for (int d = 0; d < DIM; d++)
+                        {
+                            sendBuffer[n][d] = x[j][d] + shift[d];
+                        }
                         n++;
                     }
                 }
             }
             else
             {
-                for (i = 0; i < ind->nsend[nzone]; i++)
+                for (int g : ind.index)
                 {
-                    at0 = cgindex[index[i]];
-                    at1 = cgindex[index[i]+1];
-                    for (j = at0; j < at1; j++)
+                    for (int j : atomGrouping.block(g))
                     {
                         /* Shift x */
-                        buf[n][XX] = x[j][XX] + shift[XX];
+                        sendBuffer[n][XX] = x[j][XX] + shift[XX];
                         /* Rotate y and z.
                          * This operation requires a special shift force
                          * treatment, which is performed in calc_vir.
                          */
-                        buf[n][YY] = box[YY][YY] - x[j][YY];
-                        buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
+                        sendBuffer[n][YY] = box[YY][YY] - x[j][YY];
+                        sendBuffer[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
                         n++;
                     }
                 }
             }
 
-            if (cd->bInPlace)
+            DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
+
+            gmx::ArrayRef<gmx::RVec>   receiveBuffer;
+            if (cd->receiveInPlace)
             {
-                rbuf = x + nat_tot;
+                receiveBuffer = gmx::arrayRefFromArray(x.data() + nat_tot, ind.nrecv[nzone + 1]);
             }
             else
             {
-                rbuf = comm->vbuf2.v;
+                receiveBuffer = receiveBufferAccess.buffer;
             }
             /* Send and receive the coordinates */
-            dd_sendrecv_rvec(dd, d, dddirBackward,
-                             buf,  ind->nsend[nzone+1],
-                             rbuf, ind->nrecv[nzone+1]);
-            if (!cd->bInPlace)
+            ddSendrecv(dd, d, dddirBackward,
+                       sendBuffer, receiveBuffer);
+
+            if (!cd->receiveInPlace)
             {
-                j = 0;
-                for (zone = 0; zone < nzone; zone++)
+                int j = 0;
+                for (int zone = 0; zone < nzone; zone++)
                 {
-                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
+                    for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
                     {
-                        copy_rvec(rbuf[j], x[i]);
-                        j++;
+                        x[i] = receiveBuffer[j++];
                     }
                 }
             }
-            nat_tot += ind->nrecv[nzone+1];
+            nat_tot += ind.nrecv[nzone+1];
         }
         nzone += nzone;
     }
+
+    wallcycle_stop(wcycle, ewcMOVEX);
 }
 
-void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
+void dd_move_f(gmx_domdec_t             *dd,
+               gmx::ArrayRef<gmx::RVec>  f,
+               rvec                     *fshift,
+               gmx_wallcycle            *wcycle)
 {
-    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
-    int                   *index, *cgindex;
+    wallcycle_start(wcycle, ewcMOVEF);
+
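+    /* The reverse of dd_move_x: the forces on the halo atoms are sent back
+     * over the same pulses in reverse order and accumulated onto the local
+     * atoms; where PBC was applied they are also added to the shift forces.
+     */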
+    int                    nzone, nat_tot;
     gmx_domdec_comm_t     *comm;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t      *ind;
-    rvec                  *buf, *sbuf;
     ivec                   vis;
     int                    is;
     gmx_bool               bShiftForcesNeedPbc, bScrew;
 
     comm = dd->comm;
 
-    cgindex = dd->cgindex;
-
-    buf = comm->vbuf.v;
+    const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 
     nzone   = comm->zones.n/2;
-    nat_tot = dd->nat_tot;
-    for (d = dd->ndim-1; d >= 0; d--)
+    nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
+    for (int d = dd->ndim-1; d >= 0; d--)
     {
         /* Only forces in domains near the PBC boundaries need to
            consider PBC in the treatment of fshift */
@@ -538,43 +486,48 @@ void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
         is              = IVEC2IS(vis);
 
         cd = &comm->cd[d];
-        for (p = cd->np-1; p >= 0; p--)
+        for (int p = cd->numPulses() - 1; p >= 0; p--)
         {
-            ind      = &cd->ind[p];
-            nat_tot -= ind->nrecv[nzone+1];
-            if (cd->bInPlace)
+            const gmx_domdec_ind_t    &ind  = cd->ind[p];
+            DDBufferAccess<gmx::RVec>  receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
+            gmx::ArrayRef<gmx::RVec>  &receiveBuffer = receiveBufferAccess.buffer;
+
+            nat_tot                        -= ind.nrecv[nzone+1];
+
+            DDBufferAccess<gmx::RVec>  sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
+
+            gmx::ArrayRef<gmx::RVec>   sendBuffer;
+            if (cd->receiveInPlace)
             {
-                sbuf = f + nat_tot;
+                sendBuffer = gmx::arrayRefFromArray(f.data() + nat_tot, ind.nrecv[nzone + 1]);
             }
             else
             {
-                sbuf = comm->vbuf2.v;
-                j    = 0;
-                for (zone = 0; zone < nzone; zone++)
+                sendBuffer = sendBufferAccess.buffer;
+                int j = 0;
+                for (int zone = 0; zone < nzone; zone++)
                 {
-                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
+                    for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
                     {
-                        copy_rvec(f[i], sbuf[j]);
-                        j++;
+                        sendBuffer[j++] = f[i];
                     }
                 }
             }
             /* Communicate the forces */
-            dd_sendrecv_rvec(dd, d, dddirForward,
-                             sbuf, ind->nrecv[nzone+1],
-                             buf,  ind->nsend[nzone+1]);
-            index = ind->index;
+            ddSendrecv(dd, d, dddirForward,
+                       sendBuffer, receiveBuffer);
             /* Add the received forces */
-            n = 0;
+            int n = 0;
             if (!bShiftForcesNeedPbc)
             {
-                for (i = 0; i < ind->nsend[nzone]; i++)
+                for (int g : ind.index)
                 {
-                    at0 = cgindex[index[i]];
-                    at1 = cgindex[index[i]+1];
-                    for (j = at0; j < at1; j++)
+                    for (int j : atomGrouping.block(g))
                     {
-                        rvec_inc(f[j], buf[n]);
+                        for (int d = 0; d < DIM; d++)
+                        {
+                            f[j][d] += receiveBuffer[n][d];
+                        }
                         n++;
                     }
                 }
@@ -583,36 +536,41 @@ void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
             {
                 /* fshift should always be defined if this function is
                  * called when bShiftForcesNeedPbc is true */
-                assert(NULL != fshift);
-                for (i = 0; i < ind->nsend[nzone]; i++)
+                assert(nullptr != fshift);
+                for (int g : ind.index)
                 {
-                    at0 = cgindex[index[i]];
-                    at1 = cgindex[index[i]+1];
-                    for (j = at0; j < at1; j++)
+                    for (int j : atomGrouping.block(g))
                     {
-                        rvec_inc(f[j], buf[n]);
+                        for (int d = 0; d < DIM; d++)
+                        {
+                            f[j][d] += receiveBuffer[n][d];
+                        }
                         /* Add this force to the shift force */
-                        rvec_inc(fshift[is], buf[n]);
+                        for (int d = 0; d < DIM; d++)
+                        {
+                            fshift[is][d] += receiveBuffer[n][d];
+                        }
                         n++;
                     }
                 }
             }
             else
             {
-                for (i = 0; i < ind->nsend[nzone]; i++)
+                for (int g : ind.index)
                 {
-                    at0 = cgindex[index[i]];
-                    at1 = cgindex[index[i]+1];
-                    for (j = at0; j < at1; j++)
+                    for (int j : atomGrouping.block(g))
                     {
                         /* Rotate the force */
-                        f[j][XX] += buf[n][XX];
-                        f[j][YY] -= buf[n][YY];
-                        f[j][ZZ] -= buf[n][ZZ];
+                        f[j][XX] += receiveBuffer[n][XX];
+                        f[j][YY] -= receiveBuffer[n][YY];
+                        f[j][ZZ] -= receiveBuffer[n][ZZ];
                         if (fshift)
                         {
                             /* Add this force to the shift force */
-                            rvec_inc(fshift[is], buf[n]);
+                            for (int d = 0; d < DIM; d++)
+                            {
+                                fshift[is][d] += receiveBuffer[n][d];
+                            }
                         }
                         n++;
                     }
@@ -621,69 +579,81 @@ void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
         }
         nzone /= 2;
     }
+    wallcycle_stop(wcycle, ewcMOVEF);
+}
+
+/* Convenience function for extracting a real buffer from an rvec buffer
+ *
+ * To reduce the number of temporary communication buffers and avoid
+ * cache pollution, we reuse gmx::RVec buffers for storing reals.
+ * This function returns a real buffer reference with the same number
+ * of elements as the gmx::RVec buffer (so 1/3 of the size in bytes).
+ */
+static gmx::ArrayRef<real>
+realArrayRefFromRvecArrayRef(gmx::ArrayRef<gmx::RVec> arrayRef)
+{
+    return gmx::arrayRefFromArray(as_rvec_array(arrayRef.data())[0],
+                                  arrayRef.size());
 }
 
 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 {
-    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
-    int                   *index, *cgindex;
+    int                    nzone, nat_tot;
     gmx_domdec_comm_t     *comm;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t      *ind;
-    real                  *buf, *rbuf;
 
     comm = dd->comm;
 
-    cgindex = dd->cgindex;
-
-    buf = &comm->vbuf.v[0][0];
+    const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 
     nzone   = 1;
-    nat_tot = dd->nat_home;
-    for (d = 0; d < dd->ndim; d++)
+    nat_tot = comm->atomRanges.numHomeAtoms();
+    for (int d = 0; d < dd->ndim; d++)
     {
         cd = &comm->cd[d];
-        for (p = 0; p < cd->np; p++)
+        for (const gmx_domdec_ind_t &ind : cd->ind)
         {
-            ind   = &cd->ind[p];
-            index = ind->index;
-            n     = 0;
-            for (i = 0; i < ind->nsend[nzone]; i++)
+            /* Note: We provision for RVec instead of real, so a factor of 3
+             * more than needed. The buffer actually already has this size
+             * and we pass a plain pointer below, so this does not matter.
+             */
+            DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
+            gmx::ArrayRef<real>       sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
+            int                       n          = 0;
+            for (int g : ind.index)
             {
-                at0 = cgindex[index[i]];
-                at1 = cgindex[index[i]+1];
-                for (j = at0; j < at1; j++)
+                for (int j : atomGrouping.block(g))
                 {
-                    buf[n] = v[j];
-                    n++;
+                    sendBuffer[n++] = v[j];
                 }
             }
 
-            if (cd->bInPlace)
+            DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
+
+            gmx::ArrayRef<real>       receiveBuffer;
+            if (cd->receiveInPlace)
             {
-                rbuf = v + nat_tot;
+                receiveBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
             }
             else
             {
-                rbuf = &comm->vbuf2.v[0][0];
+                receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
             }
-            /* Send and receive the coordinates */
-            dd_sendrecv_real(dd, d, dddirBackward,
-                             buf,  ind->nsend[nzone+1],
-                             rbuf, ind->nrecv[nzone+1]);
-            if (!cd->bInPlace)
+            /* Send and receive the data */
+            ddSendrecv(dd, d, dddirBackward,
+                       sendBuffer, receiveBuffer);
+            if (!cd->receiveInPlace)
             {
-                j = 0;
-                for (zone = 0; zone < nzone; zone++)
+                int j = 0;
+                for (int zone = 0; zone < nzone; zone++)
                 {
-                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
+                    for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
                     {
-                        v[i] = rbuf[j];
-                        j++;
+                        v[i] = receiveBuffer[j++];
                     }
                 }
             }
-            nat_tot += ind->nrecv[nzone+1];
+            nat_tot += ind.nrecv[nzone+1];
         }
         nzone += nzone;
     }
@@ -691,59 +661,60 @@ void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 
 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 {
-    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
-    int                   *index, *cgindex;
+    int                    nzone, nat_tot;
     gmx_domdec_comm_t     *comm;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t      *ind;
-    real                  *buf, *sbuf;
 
     comm = dd->comm;
 
-    cgindex = dd->cgindex;
-
-    buf = &comm->vbuf.v[0][0];
+    const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
 
     nzone   = comm->zones.n/2;
-    nat_tot = dd->nat_tot;
-    for (d = dd->ndim-1; d >= 0; d--)
+    nat_tot = comm->atomRanges.end(DDAtomRanges::Type::Zones);
+    for (int d = dd->ndim-1; d >= 0; d--)
     {
         cd = &comm->cd[d];
-        for (p = cd->np-1; p >= 0; p--)
+        for (int p = cd->numPulses() - 1; p >= 0; p--)
         {
-            ind      = &cd->ind[p];
-            nat_tot -= ind->nrecv[nzone+1];
-            if (cd->bInPlace)
+            const gmx_domdec_ind_t &ind = cd->ind[p];
+
+            /* Note: We provision for RVec instead of real, so a factor of 3
+             * more than needed. The buffer actually already has this size
+             * and we typecast, so this works as intended.
+             */
+            DDBufferAccess<gmx::RVec> receiveBufferAccess(comm->rvecBuffer, ind.nsend[nzone + 1]);
+            gmx::ArrayRef<real>       receiveBuffer = realArrayRefFromRvecArrayRef(receiveBufferAccess.buffer);
+            nat_tot -= ind.nrecv[nzone + 1];
+
+            DDBufferAccess<gmx::RVec> sendBufferAccess(comm->rvecBuffer2, cd->receiveInPlace ? 0 : ind.nrecv[nzone + 1]);
+
+            gmx::ArrayRef<real>       sendBuffer;
+            if (cd->receiveInPlace)
             {
-                sbuf = v + nat_tot;
+                sendBuffer = gmx::arrayRefFromArray(v + nat_tot, ind.nrecv[nzone + 1]);
             }
             else
             {
-                sbuf = &comm->vbuf2.v[0][0];
-                j    = 0;
-                for (zone = 0; zone < nzone; zone++)
+                sendBuffer = realArrayRefFromRvecArrayRef(sendBufferAccess.buffer);
+                int j = 0;
+                for (int zone = 0; zone < nzone; zone++)
                 {
-                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
+                    for (int i = ind.cell2at0[zone]; i < ind.cell2at1[zone]; i++)
                     {
-                        sbuf[j] = v[i];
-                        j++;
+                        sendBuffer[j++] = v[i];
                     }
                 }
             }
             /* Communicate the forces */
-            dd_sendrecv_real(dd, d, dddirForward,
-                             sbuf, ind->nrecv[nzone+1],
-                             buf,  ind->nsend[nzone+1]);
-            index = ind->index;
+            ddSendrecv(dd, d, dddirForward,
+                       sendBuffer, receiveBuffer);
             /* Add the received forces */
-            n = 0;
-            for (i = 0; i < ind->nsend[nzone]; i++)
+            int n = 0;
+            for (int g : ind.index)
             {
-                at0 = cgindex[index[i]];
-                at1 = cgindex[index[i]+1];
-                for (j = at0; j < at1; j++)
+                for (int j : atomGrouping.block(g))
                 {
-                    v[j] += buf[n];
+                    v[j] += receiveBuffer[n];
                     n++;
                 }
             }
@@ -761,95 +732,60 @@ static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
             zone->p1_0, zone->p1_1);
 }
 
+/* Takes the home grid size as input in cell_ns_x0 and cell_ns_x1,
+ * determines the extremes over all home and remote zones in the halo,
+ * and returns the results in cell_ns_x0 and cell_ns_x1.
+ * Note: only used with the group cut-off scheme.
+ */
+static void dd_move_cellx(gmx_domdec_t      *dd,
+                          const gmx_ddbox_t *ddbox,
+                          rvec               cell_ns_x0,
+                          rvec               cell_ns_x1)
+{
+    constexpr int      c_ddZoneCommMaxNumZones = 5;
+    gmx_ddzone_t       buf_s[c_ddZoneCommMaxNumZones];
+    gmx_ddzone_t       buf_r[c_ddZoneCommMaxNumZones];
+    gmx_ddzone_t       buf_e[c_ddZoneCommMaxNumZones];
+    gmx_domdec_comm_t *comm = dd->comm;
 
-#define DDZONECOMM_MAXZONE  5
-#define DDZONECOMM_BUFSIZE  3
-
-static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
-                               int ddimind, int direction,
-                               gmx_ddzone_t *buf_s, int n_s,
-                               gmx_ddzone_t *buf_r, int n_r)
-{
-#define ZBS  DDZONECOMM_BUFSIZE
-    rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
-    rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
-    int  i;
-
-    for (i = 0; i < n_s; i++)
+    rvec               extr_s[2];
+    rvec               extr_r[2];
+    for (int d = 1; d < dd->ndim; d++)
     {
-        vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
-        vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
-        vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
-        vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
-        vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
-        vbuf_s[i*ZBS+1][2] = 0;
-        vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
-        vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
-        vbuf_s[i*ZBS+2][2] = 0;
-    }
+        int           dim = dd->dim[d];
+        gmx_ddzone_t &zp  = (d == 1) ? comm->zone_d1[0] : comm->zone_d2[0][0];
 
-    dd_sendrecv_rvec(dd, ddimind, direction,
-                     vbuf_s, n_s*ZBS,
-                     vbuf_r, n_r*ZBS);
-
-    for (i = 0; i < n_r; i++)
-    {
-        buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
-        buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
-        buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
-        buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
-        buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
-        buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
-        buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
+        /* Copy the base sizes of the home zone */
+        zp.min0 = cell_ns_x0[dim];
+        zp.max1 = cell_ns_x1[dim];
+        zp.min1 = cell_ns_x1[dim];
+        zp.mch0 = cell_ns_x0[dim];
+        zp.mch1 = cell_ns_x1[dim];
+        zp.p1_0 = cell_ns_x0[dim];
+        zp.p1_1 = cell_ns_x1[dim];
     }
 
-#undef ZBS
-}
-
-static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
-                          rvec cell_ns_x0, rvec cell_ns_x1)
-{
-    int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
-    gmx_ddzone_t      *zp;
-    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
-    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
-    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
-    rvec               extr_s[2], extr_r[2];
-    rvec               dh;
-    real               dist_d, c = 0, det;
-    gmx_domdec_comm_t *comm;
-    gmx_bool           bPBC, bUse;
-
-    comm = dd->comm;
-
-    for (d = 1; d < dd->ndim; d++)
-    {
-        dim      = dd->dim[d];
-        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
-        zp->min0 = cell_ns_x0[dim];
-        zp->max1 = cell_ns_x1[dim];
-        zp->min1 = cell_ns_x1[dim];
-        zp->mch0 = cell_ns_x0[dim];
-        zp->mch1 = cell_ns_x1[dim];
-        zp->p1_0 = cell_ns_x0[dim];
-        zp->p1_1 = cell_ns_x1[dim];
-    }
+    gmx::ArrayRef<DDCellsizesWithDlb> cellsizes = comm->cellsizesWithDlb;
 
-    for (d = dd->ndim-2; d >= 0; d--)
+    /* Loop backward over the dimensions and aggregate the extremes
+     * of the cell sizes.
+     */
+    for (int d = dd->ndim - 2; d >= 0; d--)
     {
-        dim  = dd->dim[d];
-        bPBC = (dim < ddbox->npbcdim);
+        const int  dim      = dd->dim[d];
+        const bool applyPbc = (dim < ddbox->npbcdim);
 
         /* Use an rvec to store two reals */
-        extr_s[d][0] = comm->cell_f0[d+1];
-        extr_s[d][1] = comm->cell_f1[d+1];
-        extr_s[d][2] = comm->cell_f1[d+1];
+        extr_s[d][0] = cellsizes[d + 1].fracLower;
+        extr_s[d][1] = cellsizes[d + 1].fracUpper;
+        extr_s[d][2] = cellsizes[d + 1].fracUpper;
 
-        pos = 0;
+        int pos = 0;
+        GMX_ASSERT(pos < c_ddZoneCommMaxNumZones, "The buffers should be sufficiently large");
         /* Store the extremes in the backward sending buffer,
-         * so the get updated separately from the forward communication.
+         * so they get updated separately from the forward communication.
          */
-        for (d1 = d; d1 < dd->ndim-1; d1++)
+        for (int d1 = d; d1 < dd->ndim-1; d1++)
         {
             /* We invert the order to be able to use the same loop for buf_e */
             buf_s[pos].min0 = extr_s[d1][1];
@@ -877,11 +813,12 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
         /* We only need to communicate the extremes
          * in the forward direction
          */
-        npulse = comm->cd[d].np;
-        if (bPBC)
+        int numPulses = comm->cd[d].numPulses();
+        int numPulsesMin;
+        if (applyPbc)
         {
             /* Take the minimum to avoid double communication */
-            npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
+            numPulsesMin = std::min(numPulses, dd->nc[dim] - 1 - numPulses);
         }
         else
         {
@@ -890,20 +827,21 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
              * the communication setup and therefore we simply
              * do all communication, but ignore some data.
              */
-            npulse_min = npulse;
+            numPulsesMin = numPulses;
         }
-        for (p = 0; p < npulse_min; p++)
+        for (int pulse = 0; pulse < numPulsesMin; pulse++)
         {
             /* Communicate the extremes forward */
-            bUse = (bPBC || dd->ci[dim] > 0);
+            bool receiveValidData = (applyPbc || dd->ci[dim] > 0);
 
-            dd_sendrecv_rvec(dd, d, dddirForward,
-                             extr_s+d, dd->ndim-d-1,
-                             extr_r+d, dd->ndim-d-1);
+            int  numElements      = dd->ndim - d - 1;
+            ddSendrecv(dd, d, dddirForward,
+                       extr_s + d, numElements,
+                       extr_r + d, numElements);
 
-            if (bUse)
+            if (receiveValidData)
             {
-                for (d1 = d; d1 < dd->ndim-1; d1++)
+                for (int d1 = d; d1 < dd->ndim - 1; d1++)
                 {
                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
@@ -912,27 +850,31 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
             }
         }
 
-        buf_size = pos;
-        for (p = 0; p < npulse; p++)
+        const int numElementsInBuffer = pos;
+        for (int pulse = 0; pulse < numPulses; pulse++)
         {
             /* Communicate all the zone information backward */
-            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
+            bool receiveValidData = (applyPbc || dd->ci[dim] < dd->nc[dim] - 1);
 
-            dd_sendrecv_ddzone(dd, d, dddirBackward,
-                               buf_s, buf_size,
-                               buf_r, buf_size);
+            static_assert(sizeof(gmx_ddzone_t) == c_ddzoneNumReals*sizeof(real), "Here we expect gmx_ddzone_t to consist of c_ddzoneNumReals reals (only)");
 
-            clear_rvec(dh);
-            if (p > 0)
+            int numReals = numElementsInBuffer*c_ddzoneNumReals;
+            ddSendrecv(dd, d, dddirBackward,
+                       gmx::arrayRefFromArray(&buf_s[0].min0, numReals),
+                       gmx::arrayRefFromArray(&buf_r[0].min0, numReals));
+
+            rvec dh = { 0 };
+            if (pulse > 0)
             {
-                for (d1 = d+1; d1 < dd->ndim; d1++)
+                for (int d1 = d + 1; d1 < dd->ndim; d1++)
                 {
                     /* Determine the decrease of maximum required
                      * communication height along d1 due to the distance along d,
                      * this avoids a lot of useless atom communication.
                      */
-                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
+                    real dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
 
+                    real c;
                     if (ddbox->tric_dir[dim])
                     {
                         /* c is the off-diagonal coupling between the cell planes
@@ -944,7 +886,7 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                     {
                         c = 0;
                     }
-                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
+                    real det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
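+                    /* As far as can be inferred from the expressions here:
+                     * with h = cutoff - dh[d1], the root chosen below
+                     * satisfies (dist_d - c*h)^2 + h^2 = cutoff^2, i.e. a
+                     * point at height h along d1 and distance dist_d along
+                     * dim lies exactly at the cut-off; det <= 0 means the
+                     * quadratic has no real solution.
+                     */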
                     if (det > 0)
                     {
                         dh[d1] = comm->cutoff - (c*dist_d + std::sqrt(det))/(1 + c*c);
@@ -958,22 +900,23 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
             }
 
             /* Accumulate the extremes over all pulses */
-            for (i = 0; i < buf_size; i++)
+            for (int i = 0; i < numElementsInBuffer; i++)
             {
-                if (p == 0)
+                if (pulse == 0)
                 {
                     buf_e[i] = buf_r[i];
                 }
                 else
                 {
-                    if (bUse)
+                    if (receiveValidData)
                     {
                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
                     }
 
-                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
+                    int d1;
+                    if (dd->ndim == 3 && d == 0 && i == numElementsInBuffer - 1)
                     {
                         d1 = 1;
                     }
@@ -981,7 +924,7 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                     {
                         d1 = d + 1;
                     }
-                    if (bUse && dh[d1] >= 0)
+                    if (receiveValidData && dh[d1] >= 0)
                     {
                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
@@ -992,13 +935,13 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                  */
                 buf_s[i] = buf_r[i];
             }
-            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
-                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
+            if (((applyPbc || dd->ci[dim] + numPulses < dd->nc[dim]) && pulse == numPulses - 1) ||
+                (!applyPbc && dd->ci[dim] + 1 + pulse == dd->nc[dim] - 1))
             {
                 /* Store the extremes */
-                pos = 0;
+                int pos = 0;
 
-                for (d1 = d; d1 < dd->ndim-1; d1++)
+                for (int d1 = d; d1 < dd->ndim-1; d1++)
                 {
                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
@@ -1008,7 +951,7 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 
                 if (d == 1 || (d == 0 && dd->ndim == 3))
                 {
-                    for (i = d; i < 2; i++)
+                    for (int i = d; i < 2; i++)
                     {
                         comm->zone_d2[1-d][i] = buf_e[pos];
                         pos++;
@@ -1025,8 +968,8 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
 
     if (dd->ndim >= 2)
     {
-        dim = dd->dim[1];
-        for (i = 0; i < 2; i++)
+        int dim = dd->dim[1];
+        for (int i = 0; i < 2; i++)
         {
             if (debug)
             {
@@ -1038,10 +981,10 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
     }
     if (dd->ndim >= 3)
     {
-        dim = dd->dim[2];
-        for (i = 0; i < 2; i++)
+        int dim = dd->dim[2];
+        for (int i = 0; i < 2; i++)
         {
-            for (j = 0; j < 2; j++)
+            for (int j = 0; j < 2; j++)
             {
                 if (debug)
                 {
@@ -1052,3741 +995,1014 @@ static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
             }
         }
     }
-    for (d = 1; d < dd->ndim; d++)
+    for (int d = 1; d < dd->ndim; d++)
     {
-        comm->cell_f_max0[d] = extr_s[d-1][0];
-        comm->cell_f_min1[d] = extr_s[d-1][1];
+        cellsizes[d].fracLowerMax = extr_s[d-1][0];
+        cellsizes[d].fracUpperMin = extr_s[d-1][1];
         if (debug)
         {
             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
-                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
+                    d, cellsizes[d].fracLowerMax, cellsizes[d].fracUpperMin);
         }
     }
 }
 
-static void dd_collect_cg(gmx_domdec_t  *dd,
-                          const t_state *state_local)
+static void write_dd_grid_pdb(const char *fn, int64_t step,
+                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
 {
-    gmx_domdec_master_t *ma = nullptr;
-    int                  buf2[2], *ibuf, i, ncg_home = 0, nat_home = 0;
-
-    if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
-    {
-        /* The master has the correct distribution */
-        return;
-    }
-
-    const int *cg;
-
-    if (state_local->ddp_count == dd->ddp_count)
-    {
-        /* The local state and DD are in sync, use the DD indices */
-        ncg_home = dd->ncg_home;
-        cg       = dd->index_gl;
-        nat_home = dd->nat_home;
-    }
-    else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
-    {
-        /* The DD is out of sync with the local state, but we have stored
-         * the cg indices with the local state, so we can use those.
-         */
-        t_block *cgs_gl;
-
-        cgs_gl = &dd->comm->cgs_gl;
+    rvec   grid_s[2], *grid_r = nullptr, cx, r;
+    char   fname[STRLEN], buf[22];
+    FILE  *out;
+    int    a, i, d, z, y, x;
+    matrix tric;
+    real   vol;
 
-        ncg_home = state_local->cg_gl.size();
-        cg       = state_local->cg_gl.data();
-        nat_home = 0;
-        for (i = 0; i < ncg_home; i++)
-        {
-            nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
-        }
-    }
-    else
-    {
-        gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
-    }
+    copy_rvec(dd->comm->cell_x0, grid_s[0]);
+    copy_rvec(dd->comm->cell_x1, grid_s[1]);
 
-    buf2[0] = ncg_home;
-    buf2[1] = nat_home;
     if (DDMASTER(dd))
     {
-        ma   = dd->ma;
-        ibuf = ma->ibuf;
-    }
-    else
-    {
-        ibuf = nullptr;
+        snew(grid_r, 2*dd->nnodes);
     }
-    /* Collect the charge group and atom counts on the master */
-    dd_gather(dd, 2*sizeof(int), buf2, ibuf);
+
+    dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : nullptr);
 
     if (DDMASTER(dd))
     {
-        ma->index[0] = 0;
-        for (i = 0; i < dd->nnodes; i++)
-        {
-            ma->ncg[i]     = ma->ibuf[2*i];
-            ma->nat[i]     = ma->ibuf[2*i+1];
-            ma->index[i+1] = ma->index[i] + ma->ncg[i];
-
-        }
-        /* Make byte counts and indices */
-        for (i = 0; i < dd->nnodes; i++)
-        {
-            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
-            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
-        }
-        if (debug)
+        for (d = 0; d < DIM; d++)
         {
-            fprintf(debug, "Initial charge group distribution: ");
-            for (i = 0; i < dd->nnodes; i++)
+            for (i = 0; i < DIM; i++)
             {
-                fprintf(debug, " %d", ma->ncg[i]);
+                if (d == i)
+                {
+                    tric[d][i] = 1;
+                }
+                else
+                {
+                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
+                    {
+                        tric[d][i] = box[i][d]/box[i][i];
+                    }
+                    else
+                    {
+                        tric[d][i] = 0;
+                    }
+                }
             }
-            fprintf(debug, "\n");
         }
-    }
-
-    /* Collect the charge group indices on the master */
-    dd_gatherv(dd,
-               ncg_home*sizeof(int), cg,
-               DDMASTER(dd) ? ma->ibuf : nullptr,
-               DDMASTER(dd) ? ma->ibuf+dd->nnodes : nullptr,
-               DDMASTER(dd) ? ma->cg : nullptr);
-
-    dd->comm->master_cg_ddp_count = state_local->ddp_count;
-}
-
-static void dd_collect_vec_sendrecv(gmx_domdec_t                  *dd,
-                                    gmx::ArrayRef<const gmx::RVec> lv,
-                                    gmx::ArrayRef<gmx::RVec>       v)
-{
-    gmx_domdec_master_t *ma;
-    int                  n, i, c, a, nalloc = 0;
-    rvec                *buf = nullptr;
-    t_block             *cgs_gl;
-
-    ma = dd->ma;
-
-    if (!DDMASTER(dd))
-    {
-#if GMX_MPI
-        MPI_Send(const_cast<void *>(static_cast<const void *>(lv.data())), dd->nat_home*sizeof(rvec), MPI_BYTE,
-                 DDMASTERRANK(dd), dd->rank, dd->mpi_comm_all);
-#endif
-    }
-    else
-    {
-        /* Copy the master coordinates to the global array */
-        cgs_gl = &dd->comm->cgs_gl;
-
-        n = DDMASTERRANK(dd);
-        a = 0;
-        for (i = ma->index[n]; i < ma->index[n+1]; i++)
+        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
+        out = gmx_fio_fopen(fname, "w");
+        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
+        a = 1;
+        for (i = 0; i < dd->nnodes; i++)
         {
-            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
+            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
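+            /* After the loop below, vol is the volume of this DD cell
+             * relative to the average cell volume; it is written in the
+             * B-factor field so the grid can be colored by cell size.
+             */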
+            for (d = 0; d < DIM; d++)
             {
-                copy_rvec(lv[a++], v[c]);
+                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
             }
-        }
-
-        for (n = 0; n < dd->nnodes; n++)
-        {
-            if (n != dd->rank)
+            for (z = 0; z < 2; z++)
             {
-                if (ma->nat[n] > nalloc)
+                for (y = 0; y < 2; y++)
                 {
-                    nalloc = over_alloc_dd(ma->nat[n]);
-                    srenew(buf, nalloc);
+                    for (x = 0; x < 2; x++)
+                    {
+                        cx[XX] = grid_r[i*2+x][XX];
+                        cx[YY] = grid_r[i*2+y][YY];
+                        cx[ZZ] = grid_r[i*2+z][ZZ];
+                        mvmul(tric, cx, r);
+                        gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
+                                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
+                    }
                 }
-#if GMX_MPI
-                MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
-                         n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
-#endif
-                a = 0;
-                for (i = ma->index[n]; i < ma->index[n+1]; i++)
+            }
+            for (d = 0; d < DIM; d++)
+            {
+                for (x = 0; x < 4; x++)
                 {
-                    for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
+                    switch (d)
                     {
-                        copy_rvec(buf[a++], v[c]);
+                        case 0: y = 1 + i*8 + 2*x; break;
+                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
+                        case 2: y = 1 + i*8 + x; break;
                     }
+                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
                 }
             }
         }
-        sfree(buf);
+        gmx_fio_fclose(out);
+        sfree(grid_r);
     }
 }
 
-static void get_commbuffer_counts(gmx_domdec_t *dd,
-                                  int **counts, int **disps)
+void write_dd_pdb(const char *fn, int64_t step, const char *title,
+                  const gmx_mtop_t *mtop, const t_commrec *cr,
+                  int natoms, const rvec x[], const matrix box)
 {
-    gmx_domdec_master_t *ma;
-    int                  n;
-
-    ma = dd->ma;
+    char          fname[STRLEN], buf[22];
+    FILE         *out;
+    int           resnr;
+    const char   *atomname, *resname;
+    gmx_domdec_t *dd;
 
-    /* Make the rvec count and displacment arrays */
-    *counts  = ma->ibuf;
-    *disps   = ma->ibuf + dd->nnodes;
-    for (n = 0; n < dd->nnodes; n++)
+    dd = cr->dd;
+    if (natoms == -1)
     {
-        (*counts)[n] = ma->nat[n]*sizeof(rvec);
-        (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
+        natoms = dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites);
     }
-}
-
-static void dd_collect_vec_gatherv(gmx_domdec_t                  *dd,
-                                   gmx::ArrayRef<const gmx::RVec> lv,
-                                   gmx::ArrayRef<gmx::RVec>       v)
-{
-    gmx_domdec_master_t *ma;
-    int                 *rcounts = nullptr, *disps = nullptr;
-    int                  n, i, c, a;
-    rvec                *buf = nullptr;
-    t_block             *cgs_gl;
-
-    ma = dd->ma;
-
-    if (DDMASTER(dd))
-    {
-        get_commbuffer_counts(dd, &rcounts, &disps);
 
-        buf = ma->vbuf;
-    }
+    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
 
-    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv.data(), rcounts, disps, buf);
+    out = gmx_fio_fopen(fname, "w");
 
-    if (DDMASTER(dd))
+    fprintf(out, "TITLE     %s\n", title);
+    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
+    int molb = 0;
+    for (int i = 0; i < natoms; i++)
     {
-        cgs_gl = &dd->comm->cgs_gl;
-
-        a = 0;
-        for (n = 0; n < dd->nnodes; n++)
+        int  ii = dd->globalAtomIndices[i];
+        mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
+        int  c;
+        real b;
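+        /* b encodes which DD zone atom i belongs to; atoms present only for
+         * vsites get zones.n and atoms present only for constraints get
+         * zones.n + 1. It is written as the B-factor below.
+         */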
+        if (i < dd->comm->atomRanges.end(DDAtomRanges::Type::Zones))
         {
-            for (i = ma->index[n]; i < ma->index[n+1]; i++)
+            c = 0;
+            while (i >= dd->atomGrouping().subRange(0, dd->comm->zones.cg_range[c + 1]).end())
             {
-                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
-                {
-                    copy_rvec(buf[a++], v[c]);
-                }
+                c++;
             }
+            b = c;
         }
+        else if (i < dd->comm->atomRanges.end(DDAtomRanges::Type::Vsites))
+        {
+            b = dd->comm->zones.n;
+        }
+        else
+        {
+            b = dd->comm->zones.n + 1;
+        }
+        gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
+                                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
     }
-}
-
-void dd_collect_vec(gmx_domdec_t                  *dd,
-                    const t_state                 *state_local,
-                    gmx::ArrayRef<const gmx::RVec> lv,
-                    gmx::ArrayRef<gmx::RVec>       v)
-{
-    dd_collect_cg(dd, state_local);
+    fprintf(out, "TER\n");
 
-    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
-    {
-        dd_collect_vec_sendrecv(dd, lv, v);
-    }
-    else
-    {
-        dd_collect_vec_gatherv(dd, lv, v);
-    }
+    gmx_fio_fclose(out);
 }
 
-
-void dd_collect_state(gmx_domdec_t *dd,
-                      const t_state *state_local, t_state *state)
+real dd_cutoff_multibody(const gmx_domdec_t *dd)
 {
-    int nh = state_local->nhchainlength;
+    gmx_domdec_comm_t *comm;
+    int                di;
+    real               r;
 
-    if (DDMASTER(dd))
-    {
-        GMX_RELEASE_ASSERT(state->nhchainlength == nh, "The global and local Nose-Hoover chain lengths should match");
+    comm = dd->comm;
 
-        for (int i = 0; i < efptNR; i++)
+    r = -1;
+    if (comm->bInterCGBondeds)
+    {
+        if (comm->cutoff_mbody > 0)
         {
-            state->lambda[i] = state_local->lambda[i];
+            r = comm->cutoff_mbody;
         }
-        state->fep_state = state_local->fep_state;
-        state->veta      = state_local->veta;
-        state->vol0      = state_local->vol0;
-        copy_mat(state_local->box, state->box);
-        copy_mat(state_local->boxv, state->boxv);
-        copy_mat(state_local->svir_prev, state->svir_prev);
-        copy_mat(state_local->fvir_prev, state->fvir_prev);
-        copy_mat(state_local->pres_prev, state->pres_prev);
-
-        for (int i = 0; i < state_local->ngtc; i++)
+        else
         {
-            for (int j = 0; j < nh; j++)
+            /* cutoff_mbody=0 means we do not have DLB */
+            r = comm->cellsize_min[dd->dim[0]];
+            for (di = 1; di < dd->ndim; di++)
             {
-                state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
-                state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
+                r = std::min(r, comm->cellsize_min[dd->dim[di]]);
             }
-            state->therm_integral[i] = state_local->therm_integral[i];
-        }
-        for (int i = 0; i < state_local->nnhpres; i++)
-        {
-            for (int j = 0; j < nh; j++)
+            if (comm->bBondComm)
+            {
+                r = std::max(r, comm->cutoff_mbody);
+            }
+            else
             {
-                state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
-                state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
+                r = std::min(r, comm->cutoff);
             }
         }
-        state->baros_integral = state_local->baros_integral;
-    }
-    if (state_local->flags & (1 << estX))
-    {
-        gmx::ArrayRef<gmx::RVec> globalXRef = state ? gmx::makeArrayRef(state->x) : gmx::EmptyArrayRef();
-        dd_collect_vec(dd, state_local, state_local->x, globalXRef);
-    }
-    if (state_local->flags & (1 << estV))
-    {
-        gmx::ArrayRef<gmx::RVec> globalVRef = state ? gmx::makeArrayRef(state->v) : gmx::EmptyArrayRef();
-        dd_collect_vec(dd, state_local, state_local->v, globalVRef);
-    }
-    if (state_local->flags & (1 << estCGP))
-    {
-        gmx::ArrayRef<gmx::RVec> globalCgpRef = state ? gmx::makeArrayRef(state->cg_p) : gmx::EmptyArrayRef();
-        dd_collect_vec(dd, state_local, state_local->cg_p, globalCgpRef);
     }
+
+    return r;
 }
 
-static void dd_resize_state(t_state *state, PaddedRVecVector *f, int natoms)
+real dd_cutoff_twobody(const gmx_domdec_t *dd)
 {
-    if (debug)
-    {
-        fprintf(debug, "Resizing state: currently %d, required %d\n", state->natoms, natoms);
-    }
+    real r_mb;
 
-    state_change_natoms(state, natoms);
+    r_mb = dd_cutoff_multibody(dd);
 
-    if (f != nullptr)
-    {
-        /* We need to allocate one element extra, since we might use
-         * (unaligned) 4-wide SIMD loads to access rvec entries.
-         */
-        f->resize(paddedRVecVectorSize(natoms));
-    }
+    return std::max(dd->comm->cutoff, r_mb);
 }
 
-static void dd_check_alloc_ncg(t_forcerec       *fr,
-                               t_state          *state,
-                               PaddedRVecVector *f,
-                               int               numChargeGroups)
+
+static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
+                                   ivec coord_pme)
 {
-    if (numChargeGroups > fr->cg_nalloc)
-    {
-        if (debug)
-        {
-            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, numChargeGroups, over_alloc_dd(numChargeGroups));
-        }
-        fr->cg_nalloc = over_alloc_dd(numChargeGroups);
-        srenew(fr->cginfo, fr->cg_nalloc);
-        if (fr->cutoff_scheme == ecutsGROUP)
-        {
-            srenew(fr->cg_cm, fr->cg_nalloc);
-        }
-    }
-    if (fr->cutoff_scheme == ecutsVERLET)
-    {
-        /* We don't use charge groups, we use x in state to set up
-         * the atom communication.
-         */
-        dd_resize_state(state, f, numChargeGroups);
-    }
+    int nc, ntot;
+
+    nc   = dd->nc[dd->comm->cartpmedim];
+    ntot = dd->comm->ntot[dd->comm->cartpmedim];
+    copy_ivec(coord, coord_pme);
+    coord_pme[dd->comm->cartpmedim] =
+        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 }
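A standalone worked example of the coordinate mapping above, with assumed sizes (nc = 4 PP cells and ntot = 6 Cartesian slots along cartpmedim, i.e. two appended PME slots):

#include <cstdio>

int main()
{
    const int nc = 4, ntot = 6;          // assumed example sizes, not from a real run
    for (int coord = 0; coord < nc; coord++)
    {
        // Same arithmetic as dd_cart_coord2pmecoord() applies along the PME dimension
        const int coordPme = nc + (coord*(ntot - nc) + (ntot - nc)/2)/nc;
        std::printf("PP coordinate %d -> PME coordinate %d\n", coord, coordPme);
    }
    // Prints 0,1 -> 4 and 2,3 -> 5: each PME slot serves two adjacent PP cells.
    return 0;
}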
 
-static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
-                                       const rvec *v, rvec *lv)
+static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 {
-    gmx_domdec_master_t *ma;
-    int                  n, i, c, a, nalloc = 0;
-    rvec                *buf = nullptr;
-
-    if (DDMASTER(dd))
-    {
-        ma  = dd->ma;
+    int npp, npme;
 
-        for (n = 0; n < dd->nnodes; n++)
-        {
-            if (n != dd->rank)
-            {
-                if (ma->nat[n] > nalloc)
-                {
-                    nalloc = over_alloc_dd(ma->nat[n]);
-                    srenew(buf, nalloc);
-                }
-                /* Use lv as a temporary buffer */
-                a = 0;
-                for (i = ma->index[n]; i < ma->index[n+1]; i++)
-                {
-                    for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
-                    {
-                        copy_rvec(v[c], buf[a++]);
-                    }
-                }
-                if (a != ma->nat[n])
-                {
-                    gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
-                              a, ma->nat[n]);
-                }
+    npp  = dd->nnodes;
+    npme = dd->comm->npmenodes;
 
-#if GMX_MPI
-                MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
-                         DDRANK(dd, n), n, dd->mpi_comm_all);
-#endif
-            }
-        }
-        sfree(buf);
-        n = DDMASTERRANK(dd);
-        a = 0;
-        for (i = ma->index[n]; i < ma->index[n+1]; i++)
-        {
-            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
-            {
-                copy_rvec(v[c], lv[a++]);
-            }
-        }
-    }
-    else
-    {
-#if GMX_MPI
-        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
-                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
-#endif
-    }
+    /* Here we assign a PME node to communicate with this DD node
+     * by assuming that the major index of both is x.
+     * We add cr->npmenodes/2 to obtain an even distribution.
+     */
+    return (ddindex*npme + npme/2)/npp;
 }
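For concreteness, a tiny standalone example of the division above with assumed counts (6 PP ranks, 4 PME ranks):

#include <cstdio>

int main()
{
    const int npp = 6, npme = 4;         // assumed example counts
    for (int ddindex = 0; ddindex < npp; ddindex++)
    {
        std::printf("DD index %d -> PME index %d\n",
                    ddindex, (ddindex*npme + npme/2)/npp);
    }
    // Prints the assignment 0,1,1,2,3,3: the +npme/2 offset centres the
    // distribution so each PME rank serves one or two PP ranks.
    return 0;
}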
 
-static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
-                                       const rvec *v, rvec *lv)
+static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
 {
-    gmx_domdec_master_t *ma;
-    int                 *scounts = nullptr, *disps = nullptr;
-    int                  n, i, c, a;
-    rvec                *buf = nullptr;
+    int *pme_rank;
+    int  n, i, p0, p1;
 
-    if (DDMASTER(dd))
+    snew(pme_rank, dd->comm->npmenodes);
+    n = 0;
+    for (i = 0; i < dd->nnodes; i++)
     {
-        ma  = dd->ma;
-
-        get_commbuffer_counts(dd, &scounts, &disps);
-
-        buf = ma->vbuf;
-        a   = 0;
-        for (n = 0; n < dd->nnodes; n++)
+        p0 = ddindex2pmeindex(dd, i);
+        p1 = ddindex2pmeindex(dd, i+1);
+        if (i+1 == dd->nnodes || p1 > p0)
         {
-            for (i = ma->index[n]; i < ma->index[n+1]; i++)
+            if (debug)
             {
-                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
-                {
-                    copy_rvec(v[c], buf[a++]);
-                }
+                fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
             }
+            pme_rank[n] = i + 1 + n;
+            n++;
         }
     }
 
-    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
+    return pme_rank;
 }
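Continuing the assumed 6 PP / 4 PME example, a standalone sketch of how the loop above places each PME rank directly after the last PP rank it serves, giving interleaved global PME ranks {1, 4, 6, 9} out of 10 ranks in total:

#include <cstdio>

int main()
{
    const int npp = 6, npme = 4;         // assumed example counts
    int       n   = 0;
    for (int i = 0; i < npp; i++)
    {
        const int p0 = (i*npme + npme/2)/npp;
        const int p1 = ((i + 1)*npme + npme/2)/npp;
        if (i + 1 == npp || p1 > p0)
        {
            std::printf("pme_rank[%d] = %d\n", n, i + 1 + n);
            n++;
        }
    }
    return 0;
}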
 
-static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs,
-                              const rvec *v, rvec *lv)
+static int gmx_ddcoord2pmeindex(const t_commrec *cr, int x, int y, int z)
 {
-    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
-    {
-        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
-    }
-    else
-    {
-        dd_distribute_vec_scatterv(dd, cgs, v, lv);
-    }
+    gmx_domdec_t *dd;
+    ivec          coords;
+    int           slab;
+
+    dd = cr->dd;
+    /*
+       if (dd->comm->bCartesian) {
+       gmx_ddindex2xyz(dd->nc,ddindex,coords);
+       dd_coords2pmecoords(dd,coords,coords_pme);
+       copy_ivec(dd->ntot,nc);
+       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
+       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
+
+       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
+       } else {
+       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
+       }
+     */
+    coords[XX] = x;
+    coords[YY] = y;
+    coords[ZZ] = z;
+    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
+
+    return slab;
 }
 
-static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
+static int ddcoord2simnodeid(const t_commrec *cr, int x, int y, int z)
 {
-    if (dfhist == nullptr)
-    {
-        return;
-    }
+    gmx_domdec_comm_t *comm;
+    ivec               coords;
+    int                ddindex, nodeid = -1;
 
-    dd_bcast(dd, sizeof(int), &dfhist->bEquil);
-    dd_bcast(dd, sizeof(int), &dfhist->nlambda);
-    dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
+    comm = cr->dd->comm;
 
-    if (dfhist->nlambda > 0)
+    coords[XX] = x;
+    coords[YY] = y;
+    coords[ZZ] = z;
+    if (comm->bCartesianPP_PME)
     {
-        int nlam = dfhist->nlambda;
-        dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
-        dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
-        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
-        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
-        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
-        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
-
-        for (int i = 0; i < nlam; i++)
+#if GMX_MPI
+        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
+#endif
+    }
+    else
+    {
+        ddindex = dd_index(cr->dd->nc, coords);
+        if (comm->bCartesianPP)
+        {
+            nodeid = comm->ddindex2simnodeid[ddindex];
+        }
+        else
         {
-            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
-            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
-            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
-            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
-            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
-            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
+            if (comm->pmenodes)
+            {
+                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
+            }
+            else
+            {
+                nodeid = ddindex;
+            }
         }
     }
+
+    return nodeid;
 }
 
-static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
-                                t_state *state, t_state *state_local,
-                                PaddedRVecVector *f)
+static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
+                              const t_commrec gmx_unused *cr,
+                              int                         sim_nodeid)
 {
-    int nh = state_local->nhchainlength;
+    int pmenode = -1;
 
-    if (DDMASTER(dd))
-    {
-        GMX_RELEASE_ASSERT(state->nhchainlength == nh, "The global and local Nose-Hoover chain lengths should match");
+    const gmx_domdec_comm_t *comm = dd->comm;
 
-        for (int i = 0; i < efptNR; i++)
+    /* This assumes a uniform x domain decomposition grid cell size */
+    if (comm->bCartesianPP_PME)
+    {
+#if GMX_MPI
+        ivec coord, coord_pme;
+        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
+        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
         {
-            state_local->lambda[i] = state->lambda[i];
+            /* This is a PP node */
+            dd_cart_coord2pmecoord(dd, coord, coord_pme);
+            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
         }
-        state_local->fep_state = state->fep_state;
-        state_local->veta      = state->veta;
-        state_local->vol0      = state->vol0;
-        copy_mat(state->box, state_local->box);
-        copy_mat(state->box_rel, state_local->box_rel);
-        copy_mat(state->boxv, state_local->boxv);
-        copy_mat(state->svir_prev, state_local->svir_prev);
-        copy_mat(state->fvir_prev, state_local->fvir_prev);
-        if (state->dfhist != nullptr)
+#endif
+    }
+    else if (comm->bCartesianPP)
+    {
+        if (sim_nodeid < dd->nnodes)
         {
-            copy_df_history(state_local->dfhist, state->dfhist);
+            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
         }
-        for (int i = 0; i < state_local->ngtc; i++)
+    }
+    else
+    {
+        /* This assumes DD cells with identical x coordinates
+         * are numbered sequentially.
+         */
+        if (dd->comm->pmenodes == nullptr)
         {
-            for (int j = 0; j < nh; j++)
+            if (sim_nodeid < dd->nnodes)
             {
-                state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
-                state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
+                /* The DD index equals the nodeid */
+                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
             }
-            state_local->therm_integral[i] = state->therm_integral[i];
         }
-        for (int i = 0; i < state_local->nnhpres; i++)
+        else
         {
-            for (int j = 0; j < nh; j++)
+            int i = 0;
+            while (sim_nodeid > dd->comm->pmenodes[i])
+            {
+                i++;
+            }
+            if (sim_nodeid < dd->comm->pmenodes[i])
             {
-                state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
-                state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
+                pmenode = dd->comm->pmenodes[i];
             }
         }
-        state_local->baros_integral = state->baros_integral;
     }
-    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda.data());
-    dd_bcast(dd, sizeof(int), &state_local->fep_state);
-    dd_bcast(dd, sizeof(real), &state_local->veta);
-    dd_bcast(dd, sizeof(real), &state_local->vol0);
-    dd_bcast(dd, sizeof(state_local->box), state_local->box);
-    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
-    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
-    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
-    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
-    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi.data());
-    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi.data());
-    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral.data());
-    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi.data());
-    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi.data());
-
-    /* communicate df_history -- required for restarting from checkpoint */
-    dd_distribute_dfhist(dd, state_local->dfhist);
-
-    dd_resize_state(state_local, f, dd->nat_home);
 
-    if (state_local->flags & (1 << estX))
-    {
-        const rvec *xGlobal = (DDMASTER(dd) ? as_rvec_array(state->x.data()) : nullptr);
-        dd_distribute_vec(dd, cgs, xGlobal, as_rvec_array(state_local->x.data()));
-    }
-    if (state_local->flags & (1 << estV))
-    {
-        const rvec *vGlobal = (DDMASTER(dd) ? as_rvec_array(state->v.data()) : nullptr);
-        dd_distribute_vec(dd, cgs, vGlobal, as_rvec_array(state_local->v.data()));
-    }
-    if (state_local->flags & (1 << estCGP))
-    {
-        const rvec *cgpGlobal = (DDMASTER(dd) ? as_rvec_array(state->cg_p.data()) : nullptr);
-        dd_distribute_vec(dd, cgs, cgpGlobal, as_rvec_array(state_local->cg_p.data()));
-    }
+    return pmenode;
 }
 
-static char dim2char(int dim)
+NumPmeDomains getNumPmeDomains(const gmx_domdec_t *dd)
 {
-    char c = '?';
-
-    switch (dim)
+    if (dd != nullptr)
     {
-        case XX: c = 'X'; break;
-        case YY: c = 'Y'; break;
-        case ZZ: c = 'Z'; break;
-        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
+        return {
+                   dd->comm->npmenodes_x, dd->comm->npmenodes_y
+        };
+    }
+    else
+    {
+        return {
+                   1, 1
+        };
     }
-
-    return c;
 }
 
-static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
-                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
+std::vector<int> get_pme_ddranks(const t_commrec *cr, int pmenodeid)
 {
-    rvec   grid_s[2], *grid_r = nullptr, cx, r;
-    char   fname[STRLEN], buf[22];
-    FILE  *out;
-    int    a, i, d, z, y, x;
-    matrix tric;
-    real   vol;
-
-    copy_rvec(dd->comm->cell_x0, grid_s[0]);
-    copy_rvec(dd->comm->cell_x1, grid_s[1]);
+    gmx_domdec_t *dd;
+    int           x, y, z;
+    ivec          coord, coord_pme;
 
-    if (DDMASTER(dd))
-    {
-        snew(grid_r, 2*dd->nnodes);
-    }
+    dd = cr->dd;
 
-    dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : nullptr);
+    std::vector<int> ddranks;
+    ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
 
-    if (DDMASTER(dd))
+    for (x = 0; x < dd->nc[XX]; x++)
     {
-        for (d = 0; d < DIM; d++)
+        for (y = 0; y < dd->nc[YY]; y++)
         {
-            for (i = 0; i < DIM; i++)
+            for (z = 0; z < dd->nc[ZZ]; z++)
             {
-                if (d == i)
+                if (dd->comm->bCartesianPP_PME)
                 {
-                    tric[d][i] = 1;
+                    coord[XX] = x;
+                    coord[YY] = y;
+                    coord[ZZ] = z;
+                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
+                    if (dd->ci[XX] == coord_pme[XX] &&
+                        dd->ci[YY] == coord_pme[YY] &&
+                        dd->ci[ZZ] == coord_pme[ZZ])
+                    {
+                        ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
+                    }
                 }
                 else
                 {
-                    if (d < ddbox->npbcdim && dd->nc[d] > 1)
-                    {
-                        tric[d][i] = box[i][d]/box[i][i];
-                    }
-                    else
+                    /* The slab corresponds to the nodeid in the PME group */
+                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
                     {
-                        tric[d][i] = 0;
+                        ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
                     }
                 }
             }
         }
-        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
-        out = gmx_fio_fopen(fname, "w");
-        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
-        a = 1;
-        for (i = 0; i < dd->nnodes; i++)
+    }
+    return ddranks;
+}
+
+static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
+{
+    gmx_bool bReceive = TRUE;
+
+    if (cr->npmenodes < dd->nnodes)
+    {
+        gmx_domdec_comm_t *comm = dd->comm;
+        if (comm->bCartesianPP_PME)
         {
-            vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
-            for (d = 0; d < DIM; d++)
-            {
-                vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
-            }
-            for (z = 0; z < 2; z++)
+#if GMX_MPI
+            int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
+            ivec coords;
+            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
+            coords[comm->cartpmedim]++;
+            if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
             {
-                for (y = 0; y < 2; y++)
+                int rank;
+                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
+                if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
                 {
-                    for (x = 0; x < 2; x++)
-                    {
-                        cx[XX] = grid_r[i*2+x][XX];
-                        cx[YY] = grid_r[i*2+y][YY];
-                        cx[ZZ] = grid_r[i*2+z][ZZ];
-                        mvmul(tric, cx, r);
-                        gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
-                                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
-                    }
+                    /* This is not the last PP node for pmenode */
+                    bReceive = FALSE;
                 }
             }
-            for (d = 0; d < DIM; d++)
+#else
+            GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
+#endif
+        }
+        else
+        {
+            int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
+            if (cr->sim_nodeid+1 < cr->nnodes &&
+                dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
             {
-                for (x = 0; x < 4; x++)
-                {
-                    switch (d)
-                    {
-                        case 0: y = 1 + i*8 + 2*x; break;
-                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
-                        case 2: y = 1 + i*8 + x; break;
-                    }
-                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
-                }
+                /* This is not the last PP node for pmenode */
+                bReceive = FALSE;
             }
         }
-        gmx_fio_fclose(out);
-        sfree(grid_r);
     }
+
+    return bReceive;
 }
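A simplified, hedged sketch of the non-Cartesian branch above (interleaved PME ranks are ignored here): a PP rank receives the virial and energy from its PME rank only when it is the last PP rank mapped to that PME rank.

// Illustrative helper, not GROMACS API; assumes the plain ddindex -> PME index mapping.
static bool isLastPpRankForItsPmeRank(int ppRank, int numPpRanks, int numPmeRanks)
{
    auto toPmeIndex = [=](int r) { return (r*numPmeRanks + numPmeRanks/2)/numPpRanks; };
    return ppRank + 1 >= numPpRanks || toPmeIndex(ppRank + 1) != toPmeIndex(ppRank);
}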
 
-void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
-                  const gmx_mtop_t *mtop, t_commrec *cr,
-                  int natoms, rvec x[], matrix box)
+static void set_zones_ncg_home(gmx_domdec_t *dd)
 {
-    char          fname[STRLEN], buf[22];
-    FILE         *out;
-    int           i, ii, resnr, c;
-    const char   *atomname, *resname;
-    real          b;
-    gmx_domdec_t *dd;
+    gmx_domdec_zones_t *zones;
+    int                 i;
 
-    dd = cr->dd;
-    if (natoms == -1)
+    zones = &dd->comm->zones;
+
+    zones->cg_range[0] = 0;
+    for (i = 1; i < zones->n+1; i++)
     {
-        natoms = dd->comm->nat[ddnatVSITE];
+        zones->cg_range[i] = dd->ncg_home;
     }
+    /* zone_ncg1[0] should always be equal to ncg_home */
+    dd->comm->zone_ncg1[0] = dd->ncg_home;
+}
 
-    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
+static void restoreAtomGroups(gmx_domdec_t *dd,
+                              const int *gcgs_index, const t_state *state)
+{
+    gmx::ArrayRef<const int>  atomGroupsState        = state->cg_gl;
 
-    out = gmx_fio_fopen(fname, "w");
+    std::vector<int>         &globalAtomGroupIndices = dd->globalAtomGroupIndices;
+    gmx::RangePartitioning   &atomGrouping           = dd->atomGrouping_;
 
-    fprintf(out, "TITLE     %s\n", title);
-    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
-    int molb = 0;
-    for (i = 0; i < natoms; i++)
+    globalAtomGroupIndices.resize(atomGroupsState.size());
+    atomGrouping.clear();
+
+    /* Copy back the global charge group indices from state
+     * and rebuild the local charge group to atom index.
+     */
+    for (gmx::index i = 0; i < atomGroupsState.size(); i++)
     {
-        ii = dd->gatindex[i];
-        mtopGetAtomAndResidueName(mtop, ii, &molb, &atomname, &resnr, &resname, nullptr);
-        if (i < dd->comm->nat[ddnatZONE])
-        {
-            c = 0;
-            while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
-            {
-                c++;
-            }
-            b = c;
-        }
-        else if (i < dd->comm->nat[ddnatVSITE])
+        const int atomGroupGlobal  = atomGroupsState[i];
+        const int groupSize        = gcgs_index[atomGroupGlobal + 1] - gcgs_index[atomGroupGlobal];
+        globalAtomGroupIndices[i]  = atomGroupGlobal;
+        atomGrouping.appendBlock(groupSize);
+    }
+
+    dd->ncg_home = atomGroupsState.size();
+    dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, atomGrouping.fullRange().end());
+
+    set_zones_ncg_home(dd);
+}
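A minimal standalone sketch (plain std containers, not the GROMACS RangePartitioning API) of what restoreAtomGroups() rebuilds after a checkpoint read: the local charge-group to atom index as a prefix sum of global group sizes.

#include <vector>

static std::vector<int> rebuildLocalGroupToAtomIndex(const std::vector<int> &globalGroupsOfThisRank,
                                                     const std::vector<int> &globalGroupAtomRanges)
{
    std::vector<int> localIndex(1, 0);   // localIndex[g] = first local atom of group g
    for (int groupGlobal : globalGroupsOfThisRank)
    {
        const int groupSize =
            globalGroupAtomRanges[groupGlobal + 1] - globalGroupAtomRanges[groupGlobal];
        localIndex.push_back(localIndex.back() + groupSize);
    }
    return localIndex;                   // localIndex.back() is the number of home atoms
}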
+
+static void dd_set_cginfo(gmx::ArrayRef<const int> index_gl, int cg0, int cg1,
+                          t_forcerec *fr, char *bLocalCG)
+{
+    cginfo_mb_t *cginfo_mb;
+    int         *cginfo;
+    int          cg;
+
+    if (fr != nullptr)
+    {
+        cginfo_mb = fr->cginfo_mb;
+        cginfo    = fr->cginfo;
+
+        for (cg = cg0; cg < cg1; cg++)
         {
-            b = dd->comm->zones.n;
+            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
         }
-        else
+    }
+
+    if (bLocalCG != nullptr)
+    {
+        for (cg = cg0; cg < cg1; cg++)
         {
-            b = dd->comm->zones.n + 1;
+            bLocalCG[index_gl[cg]] = TRUE;
         }
-        gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
-                                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
     }
-    fprintf(out, "TER\n");
-
-    gmx_fio_fclose(out);
 }
 
-real dd_cutoff_multibody(const gmx_domdec_t *dd)
+static void make_dd_indices(gmx_domdec_t *dd,
+                            const int *gcgs_index, int cg_start)
 {
-    gmx_domdec_comm_t *comm;
-    int                di;
-    real               r;
+    const int                 numZones               = dd->comm->zones.n;
+    const int                *zone2cg                = dd->comm->zones.cg_range;
+    const int                *zone_ncg1              = dd->comm->zone_ncg1;
+    gmx::ArrayRef<const int>  globalAtomGroupIndices = dd->globalAtomGroupIndices;
+    const gmx_bool            bCGs                   = dd->comm->bCGs;
 
-    comm = dd->comm;
+    std::vector<int>         &globalAtomIndices      = dd->globalAtomIndices;
+    gmx_ga2la_t              &ga2la                  = *dd->ga2la;
 
-    r = -1;
-    if (comm->bInterCGBondeds)
+    if (zone2cg[1] != dd->ncg_home)
     {
-        if (comm->cutoff_mbody > 0)
+        gmx_incons("dd->ncg_zone is not up to date");
+    }
+
+    /* Make the local to global and global to local atom index */
+    int a = dd->atomGrouping().subRange(cg_start, cg_start).begin();
+    globalAtomIndices.resize(a);
+    for (int zone = 0; zone < numZones; zone++)
+    {
+        int cg0;
+        if (zone == 0)
         {
-            r = comm->cutoff_mbody;
+            cg0 = cg_start;
         }
         else
         {
-            /* cutoff_mbody=0 means we do not have DLB */
-            r = comm->cellsize_min[dd->dim[0]];
-            for (di = 1; di < dd->ndim; di++)
+            cg0 = zone2cg[zone];
+        }
+        int cg1    = zone2cg[zone+1];
+        int cg1_p1 = cg0 + zone_ncg1[zone];
+
+        for (int cg = cg0; cg < cg1; cg++)
+        {
+            int zone1 = zone;
+            if (cg >= cg1_p1)
             {
-                r = std::min(r, comm->cellsize_min[dd->dim[di]]);
+                /* Signal that this cg is from more than one pulse away */
+                zone1 += numZones;
             }
-            if (comm->bBondComm)
+            int cg_gl = globalAtomGroupIndices[cg];
+            if (bCGs)
             {
-                r = std::max(r, comm->cutoff_mbody);
+                for (int a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
+                {
+                    globalAtomIndices.push_back(a_gl);
+                    ga2la.insert(a_gl, {a, zone1});
+                    a++;
+                }
             }
             else
             {
-                r = std::min(r, comm->cutoff);
+                globalAtomIndices.push_back(cg_gl);
+                ga2la.insert(cg_gl, {a, zone1});
+                a++;
             }
         }
     }
-
-    return r;
 }
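Conceptually, the loop above maintains two complementary lookups; a hedged standalone sketch (plain std containers, not gmx_ga2la_t) of a single insertion:

#include <unordered_map>
#include <vector>

struct LocalAtomEntry
{
    int localIndex;   // position in the local atom list
    int zone;         // zone index, offset by the zone count when >1 pulse away
};

static void addLocalAtom(std::vector<int>                        &localToGlobal,
                         std::unordered_map<int, LocalAtomEntry> &globalToLocal,
                         int                                      globalAtom,
                         int                                      zone)
{
    const int localIndex = static_cast<int>(localToGlobal.size());
    localToGlobal.push_back(globalAtom);
    globalToLocal[globalAtom] = {localIndex, zone};
}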
 
-real dd_cutoff_twobody(const gmx_domdec_t *dd)
+static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
+                          const char *where)
 {
-    real r_mb;
-
-    r_mb = dd_cutoff_multibody(dd);
+    int nerr = 0;
+    if (bLocalCG == nullptr)
+    {
+        return nerr;
+    }
+    for (size_t i = 0; i < dd->globalAtomGroupIndices.size(); i++)
+    {
+        if (!bLocalCG[dd->globalAtomGroupIndices[i]])
+        {
+            fprintf(stderr,
+                    "DD rank %d, %s: atom group %zu, global atom group %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i + 1, dd->globalAtomGroupIndices[i] + 1, dd->ncg_home);
+            nerr++;
+        }
+    }
+    size_t ngl = 0;
+    for (int i = 0; i < ncg_sys; i++)
+    {
+        if (bLocalCG[i])
+        {
+            ngl++;
+        }
+    }
+    if (ngl != dd->globalAtomGroupIndices.size())
+    {
+        fprintf(stderr, "DD rank %d, %s: In bLocalCG %zu atom groups are marked as local, whereas there are %zu\n", dd->rank, where, ngl, dd->globalAtomGroupIndices.size());
+        nerr++;
+    }
 
-    return std::max(dd->comm->cutoff, r_mb);
+    return nerr;
 }
 
-
-static void dd_cart_coord2pmecoord(const gmx_domdec_t *dd, const ivec coord,
-                                   ivec coord_pme)
+static void check_index_consistency(gmx_domdec_t *dd,
+                                    int natoms_sys, int ncg_sys,
+                                    const char *where)
 {
-    int nc, ntot;
+    int       nerr = 0;
 
-    nc   = dd->nc[dd->comm->cartpmedim];
-    ntot = dd->comm->ntot[dd->comm->cartpmedim];
-    copy_ivec(coord, coord_pme);
-    coord_pme[dd->comm->cartpmedim] =
-        nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
-}
+    const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
 
-static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
-{
-    int npp, npme;
+    if (dd->comm->DD_debug > 1)
+    {
+        std::vector<int> have(natoms_sys);
+        for (int a = 0; a < numAtomsInZones; a++)
+        {
+            int globalAtomIndex = dd->globalAtomIndices[a];
+            if (have[globalAtomIndex] > 0)
+            {
+                fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, globalAtomIndex + 1, have[globalAtomIndex], a+1);
+            }
+            else
+            {
+                have[globalAtomIndex] = a + 1;
+            }
+        }
+    }
 
-    npp  = dd->nnodes;
-    npme = dd->comm->npmenodes;
+    std::vector<int> have(numAtomsInZones);
 
-    /* Here we assign a PME node to communicate with this DD node
-     * by assuming that the major index of both is x.
-     * We add cr->npmenodes/2 to obtain an even distribution.
-     */
-    return (ddindex*npme + npme/2)/npp;
-}
-
-static int *dd_interleaved_pme_ranks(const gmx_domdec_t *dd)
-{
-    int *pme_rank;
-    int  n, i, p0, p1;
-
-    snew(pme_rank, dd->comm->npmenodes);
-    n = 0;
-    for (i = 0; i < dd->nnodes; i++)
+    int              ngl = 0;
+    for (int i = 0; i < natoms_sys; i++)
     {
-        p0 = ddindex2pmeindex(dd, i);
-        p1 = ddindex2pmeindex(dd, i+1);
-        if (i+1 == dd->nnodes || p1 > p0)
+        if (const auto entry = dd->ga2la->find(i))
         {
-            if (debug)
+            const int a = entry->la;
+            if (a >= numAtomsInZones)
             {
-                fprintf(debug, "pme_rank[%d] = %d\n", n, i+1+n);
+                fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, numAtomsInZones);
+                nerr++;
             }
-            pme_rank[n] = i + 1 + n;
-            n++;
+            else
+            {
+                have[a] = 1;
+                if (dd->globalAtomIndices[a] != i)
+                {
+                    fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->globalAtomIndices[a]+1);
+                    nerr++;
+                }
+            }
+            ngl++;
+        }
+    }
+    if (ngl != numAtomsInZones)
+    {
+        fprintf(stderr,
+                "DD rank %d, %s: %d global atom indices, %d local atoms\n",
+                dd->rank, where, ngl, numAtomsInZones);
+    }
+    for (int a = 0; a < numAtomsInZones; a++)
+    {
+        if (have[a] == 0)
+        {
+            fprintf(stderr,
+                    "DD rank %d, %s: local atom %d, global %d has no global index\n",
+                    dd->rank, where, a + 1, dd->globalAtomIndices[a] + 1);
         }
     }
 
-    return pme_rank;
+    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
+
+    if (nerr > 0)
+    {
+        gmx_fatal(FARGS, "DD rank %d, %s: %d atom(group) index inconsistencies",
+                  dd->rank, where, nerr);
+    }
 }
 
-static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
+/* Clear all DD global state indices, starting from \p atomGroupStart and \p atomStart */
+static void clearDDStateIndices(gmx_domdec_t *dd,
+                                int           atomGroupStart,
+                                int           atomStart)
 {
-    gmx_domdec_t *dd;
-    ivec          coords;
-    int           slab;
+    gmx_ga2la_t &ga2la = *dd->ga2la;
 
-    dd = cr->dd;
-    /*
-       if (dd->comm->bCartesian) {
-       gmx_ddindex2xyz(dd->nc,ddindex,coords);
-       dd_coords2pmecoords(dd,coords,coords_pme);
-       copy_ivec(dd->ntot,nc);
-       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
-       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
+    if (atomStart == 0)
+    {
+        /* Clear the whole list without the overhead of searching */
+        ga2la.clear();
+    }
+    else
+    {
+        const int numAtomsInZones = dd->comm->atomRanges.end(DDAtomRanges::Type::Zones);
+        for (int i = 0; i < numAtomsInZones; i++)
+        {
+            ga2la.erase(dd->globalAtomIndices[i]);
+        }
+    }
 
-       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
-       } else {
-       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
-       }
-     */
-    coords[XX] = x;
-    coords[YY] = y;
-    coords[ZZ] = z;
-    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
+    char *bLocalCG = dd->comm->bLocalCG;
+    if (bLocalCG)
+    {
+        for (size_t atomGroup = atomGroupStart; atomGroup < dd->globalAtomGroupIndices.size(); atomGroup++)
+        {
+            bLocalCG[dd->globalAtomGroupIndices[atomGroup]] = FALSE;
+        }
+    }
 
-    return slab;
+    dd_clear_local_vsite_indices(dd);
+
+    if (dd->constraints)
+    {
+        dd_clear_local_constraint_indices(dd);
+    }
 }
 
-static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
+static bool check_grid_jump(int64_t             step,
+                            const gmx_domdec_t *dd,
+                            real                cutoff,
+                            const gmx_ddbox_t  *ddbox,
+                            gmx_bool            bFatal)
 {
-    gmx_domdec_comm_t *comm;
-    ivec               coords;
-    int                ddindex, nodeid = -1;
-
-    comm = cr->dd->comm;
+    gmx_domdec_comm_t *comm    = dd->comm;
+    bool               invalid = false;
 
-    coords[XX] = x;
-    coords[YY] = y;
-    coords[ZZ] = z;
-    if (comm->bCartesianPP_PME)
-    {
-#if GMX_MPI
-        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
-#endif
-    }
-    else
+    for (int d = 1; d < dd->ndim; d++)
     {
-        ddindex = dd_index(cr->dd->nc, coords);
-        if (comm->bCartesianPP)
+        const DDCellsizesWithDlb &cellsizes = comm->cellsizesWithDlb[d];
+        const int                 dim       = dd->dim[d];
+        const real                limit     = grid_jump_limit(comm, cutoff, d);
+        real                      bfac      = ddbox->box_size[dim];
+        if (ddbox->tric_dir[dim])
         {
-            nodeid = comm->ddindex2simnodeid[ddindex];
+            bfac *= ddbox->skew_fac[dim];
         }
-        else
+        if ((cellsizes.fracUpper - cellsizes.fracLowerMax)*bfac <  limit ||
+            (cellsizes.fracLower - cellsizes.fracUpperMin)*bfac > -limit)
         {
-            if (comm->pmenodes)
-            {
-                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
-            }
-            else
+            invalid = true;
+
+            if (bFatal)
             {
-                nodeid = ddindex;
+                char buf[22];
+
+                /* This error should never be triggered under normal
+                 * circumstances, but you never know ...
+                 */
+                gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
+                          gmx_step_str(step, buf),
+                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
             }
         }
     }
 
-    return nodeid;
+    return invalid;
 }
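Per DLB dimension, the acceptance test above reduces to a one-dimensional criterion on the fractional cell boundaries; a standalone sketch with assumed inputs:

// Returns true when neighbouring cell boundaries have shifted too close,
// i.e. closer than the jump limit expressed in box-length units.
static bool gridJumpTooLarge(double fracUpper, double fracLowerMax,
                             double fracLower, double fracUpperMin,
                             double boxSizeAlongDim, double jumpLimit)
{
    return (fracUpper - fracLowerMax)*boxSizeAlongDim <  jumpLimit ||
           (fracLower - fracUpperMin)*boxSizeAlongDim > -jumpLimit;
}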
 
-static int dd_simnode2pmenode(const gmx_domdec_t         *dd,
-                              const t_commrec gmx_unused *cr,
-                              int                         sim_nodeid)
+static float dd_force_load(gmx_domdec_comm_t *comm)
 {
-    int pmenode = -1;
-
-    const gmx_domdec_comm_t *comm = dd->comm;
+    float load;
 
-    /* This assumes a uniform x domain decomposition grid cell size */
-    if (comm->bCartesianPP_PME)
+    if (comm->eFlop)
     {
-#if GMX_MPI
-        ivec coord, coord_pme;
-        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
-        if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
+        load = comm->flop;
+        if (comm->eFlop > 1)
         {
-            /* This is a PP node */
-            dd_cart_coord2pmecoord(dd, coord, coord_pme);
-            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
+            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
         }
-#endif
     }
-    else if (comm->bCartesianPP)
+    else
     {
-        if (sim_nodeid < dd->nnodes)
+        load = comm->cycl[ddCyclF];
+        if (comm->cycl_n[ddCyclF] > 1)
         {
-            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
+            /* Subtract the maximum of the last n cycle counts
+             * to get rid of possible high counts due to other sources,
+             * for instance system activity, that would otherwise
+             * affect the dynamic load balancing.
+             */
+            load -= comm->cycl_max[ddCyclF];
         }
-    }
-    else
-    {
-        /* This assumes DD cells with identical x coordinates
-         * are numbered sequentially.
-         */
-        if (dd->comm->pmenodes == nullptr)
+
+#if GMX_MPI
+        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
         {
-            if (sim_nodeid < dd->nnodes)
+            float gpu_wait, gpu_wait_sum;
+
+            gpu_wait = comm->cycl[ddCyclWaitGPU];
+            if (comm->cycl_n[ddCyclF] > 1)
             {
-                /* The DD index equals the nodeid */
-                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
+                /* We should remove the WaitGPU time of the same MD step
+                 * as the one with the maximum F time, since the F time
+                 * and the wait time are not independent.
+                 * Furthermore, the step for the max F time should be chosen
+                 * the same on all ranks that share the same GPU.
+                 * But to keep the code simple, we remove the average instead.
+                 * The main reason for artificially long times at some steps
+                 * is spurious CPU activity or MPI time, so we don't expect
+                 * that changes in the GPU wait time matter a lot here.
+                 */
+                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/static_cast<float>(comm->cycl_n[ddCyclF]);
             }
+            /* Sum the wait times over the ranks that share the same GPU */
+            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
+                          comm->mpi_comm_gpu_shared);
+            /* Replace the wait time by the average over the ranks */
+            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
+        }
+#endif
+    }
+
+    return load;
+}
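The GPU-sharing correction above amounts to replacing a rank's own wait time by the average over the ranks sharing the GPU; a hedged standalone sketch of that arithmetic (ignoring the (n-1)/n scaling applied above):

static float correctedForceLoad(float forceCycles, float ownGpuWait,
                                float gpuWaitSumOverSharingRanks, int numSharingRanks)
{
    // loadSignal = forceCycles - ownGpuWait + averageGpuWaitOverSharingRanks
    return forceCycles - ownGpuWait + gpuWaitSumOverSharingRanks/static_cast<float>(numSharingRanks);
}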
+
+static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
+{
+    gmx_domdec_comm_t *comm;
+    int                i;
+
+    comm = dd->comm;
+
+    snew(*dim_f, dd->nc[dim]+1);
+    (*dim_f)[0] = 0;
+    for (i = 1; i < dd->nc[dim]; i++)
+    {
+        if (comm->slb_frac[dim])
+        {
+            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
         }
         else
         {
-            int i = 0;
-            while (sim_nodeid > dd->comm->pmenodes[i])
-            {
-                i++;
-            }
-            if (sim_nodeid < dd->comm->pmenodes[i])
-            {
-                pmenode = dd->comm->pmenodes[i];
-            }
+            (*dim_f)[i] = static_cast<real>(i)/static_cast<real>(dd->nc[dim]);
         }
     }
-
-    return pmenode;
+    (*dim_f)[dd->nc[dim]] = 1;
 }
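A standalone sketch of the boundary construction above (assumed inputs): cumulative cell boundaries in [0, 1], built either from user-supplied static load-balancing fractions or uniformly when none are given.

#include <vector>

static std::vector<double> slabBoundaries(int ncells, const std::vector<double> &fracPerCell)
{
    std::vector<double> f(ncells + 1, 0.0);
    for (int i = 1; i < ncells; i++)
    {
        f[i] = fracPerCell.empty() ? static_cast<double>(i)/ncells
                                   : f[i - 1] + fracPerCell[i - 1];
    }
    f[ncells] = 1.0;                     // the last boundary is always the box edge
    return f;
}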
 
-void get_pme_nnodes(const gmx_domdec_t *dd,
-                    int *npmenodes_x, int *npmenodes_y)
+static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
 {
-    if (dd != nullptr)
+    int  pmeindex, slab, nso, i;
+    ivec xyz;
+
+    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
     {
-        *npmenodes_x = dd->comm->npmenodes_x;
-        *npmenodes_y = dd->comm->npmenodes_y;
+        ddpme->dim = YY;
     }
     else
     {
-        *npmenodes_x = 1;
-        *npmenodes_y = 1;
+        ddpme->dim = dimind;
     }
-}
-
-std::vector<int> get_pme_ddranks(t_commrec *cr, int pmenodeid)
-{
-    gmx_domdec_t *dd;
-    int           x, y, z;
-    ivec          coord, coord_pme;
+    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
 
-    dd = cr->dd;
+    ddpme->nslab = (ddpme->dim == 0 ?
+                    dd->comm->npmenodes_x :
+                    dd->comm->npmenodes_y);
 
-    std::vector<int> ddranks;
-    ddranks.reserve((dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
+    if (ddpme->nslab <= 1)
+    {
+        return;
+    }
 
-    for (x = 0; x < dd->nc[XX]; x++)
+    nso = dd->comm->npmenodes/ddpme->nslab;
+    /* Determine for each PME slab the PP location range for dimension dim */
+    snew(ddpme->pp_min, ddpme->nslab);
+    snew(ddpme->pp_max, ddpme->nslab);
+    for (slab = 0; slab < ddpme->nslab; slab++)
     {
-        for (y = 0; y < dd->nc[YY]; y++)
-        {
-            for (z = 0; z < dd->nc[ZZ]; z++)
-            {
-                if (dd->comm->bCartesianPP_PME)
-                {
-                    coord[XX] = x;
-                    coord[YY] = y;
-                    coord[ZZ] = z;
-                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
-                    if (dd->ci[XX] == coord_pme[XX] &&
-                        dd->ci[YY] == coord_pme[YY] &&
-                        dd->ci[ZZ] == coord_pme[ZZ])
-                    {
-                        ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
-                    }
-                }
-                else
-                {
-                    /* The slab corresponds to the nodeid in the PME group */
-                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
-                    {
-                        ddranks.push_back(ddcoord2simnodeid(cr, x, y, z));
-                    }
-                }
-            }
-        }
+        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
+        ddpme->pp_max[slab] = 0;
     }
-    return ddranks;
-}
-
-static gmx_bool receive_vir_ener(const gmx_domdec_t *dd, const t_commrec *cr)
-{
-    gmx_bool bReceive = TRUE;
-
-    if (cr->npmenodes < dd->nnodes)
+    for (i = 0; i < dd->nnodes; i++)
     {
-        gmx_domdec_comm_t *comm = dd->comm;
-        if (comm->bCartesianPP_PME)
+        ddindex2xyz(dd->nc, i, xyz);
+        /* For y only use our y/z slab.
+         * This assumes that the PME x grid size matches the DD grid size.
+         */
+        if (dimind == 0 || xyz[XX] == dd->ci[XX])
         {
-#if GMX_MPI
-            int  pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
-            ivec coords;
-            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
-            coords[comm->cartpmedim]++;
-            if (coords[comm->cartpmedim] < dd->nc[comm->cartpmedim])
+            pmeindex = ddindex2pmeindex(dd, i);
+            if (dimind == 0)
             {
-                int rank;
-                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
-                if (dd_simnode2pmenode(dd, cr, rank) == pmenode)
-                {
-                    /* This is not the last PP node for pmenode */
-                    bReceive = FALSE;
-                }
+                slab = pmeindex/nso;
             }
-#else
-            GMX_RELEASE_ASSERT(false, "Without MPI we should not have Cartesian PP-PME with #PMEnodes < #DDnodes");
-#endif
-        }
-        else
-        {
-            int pmenode = dd_simnode2pmenode(dd, cr, cr->sim_nodeid);
-            if (cr->sim_nodeid+1 < cr->nnodes &&
-                dd_simnode2pmenode(dd, cr, cr->sim_nodeid+1) == pmenode)
+            else
             {
-                /* This is not the last PP node for pmenode */
-                bReceive = FALSE;
+                slab = pmeindex % ddpme->nslab;
             }
+            ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
+            ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
         }
     }
 
-    return bReceive;
-}
-
-static void set_zones_ncg_home(gmx_domdec_t *dd)
-{
-    gmx_domdec_zones_t *zones;
-    int                 i;
-
-    zones = &dd->comm->zones;
-
-    zones->cg_range[0] = 0;
-    for (i = 1; i < zones->n+1; i++)
-    {
-        zones->cg_range[i] = dd->ncg_home;
-    }
-    /* zone_ncg1[0] should always be equal to ncg_home */
-    dd->comm->zone_ncg1[0] = dd->ncg_home;
-}
-
-static void rebuild_cgindex(gmx_domdec_t *dd,
-                            const int *gcgs_index, const t_state *state)
-{
-    int * gmx_restrict dd_cg_gl = dd->index_gl;
-    int * gmx_restrict cgindex  = dd->cgindex;
-    int                nat      = 0;
-
-    /* Copy back the global charge group indices from state
-     * and rebuild the local charge group to atom index.
-     */
-    cgindex[0] = nat;
-    for (unsigned int i = 0; i < state->cg_gl.size(); i++)
-    {
-        cgindex[i]  = nat;
-        int cg_gl   = state->cg_gl[i];
-        dd_cg_gl[i] = cg_gl;
-        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
-    }
-    cgindex[state->cg_gl.size()] = nat;
-
-    dd->ncg_home = state->cg_gl.size();
-    dd->nat_home = nat;
-
-    set_zones_ncg_home(dd);
-}
-
-static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
-{
-    while (cg >= cginfo_mb->cg_end)
-    {
-        cginfo_mb++;
-    }
-
-    return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
-}
-
-static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
-                          t_forcerec *fr, char *bLocalCG)
-{
-    cginfo_mb_t *cginfo_mb;
-    int         *cginfo;
-    int          cg;
-
-    if (fr != nullptr)
-    {
-        cginfo_mb = fr->cginfo_mb;
-        cginfo    = fr->cginfo;
-
-        for (cg = cg0; cg < cg1; cg++)
-        {
-            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
-        }
-    }
-
-    if (bLocalCG != nullptr)
-    {
-        for (cg = cg0; cg < cg1; cg++)
-        {
-            bLocalCG[index_gl[cg]] = TRUE;
-        }
-    }
+    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 }
 
-static void make_dd_indices(gmx_domdec_t *dd,
-                            const int *gcgs_index, int cg_start)
+int dd_pme_maxshift_x(const gmx_domdec_t *dd)
 {
-    int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
-    int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
-    gmx_bool     bCGs;
-
-    if (dd->nat_tot > dd->gatindex_nalloc)
-    {
-        dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
-        srenew(dd->gatindex, dd->gatindex_nalloc);
-    }
-
-    nzone      = dd->comm->zones.n;
-    zone2cg    = dd->comm->zones.cg_range;
-    zone_ncg1  = dd->comm->zone_ncg1;
-    index_gl   = dd->index_gl;
-    gatindex   = dd->gatindex;
-    bCGs       = dd->comm->bCGs;
-
-    if (zone2cg[1] != dd->ncg_home)
+    if (dd->comm->ddpme[0].dim == XX)
     {
-        gmx_incons("dd->ncg_zone is not up to date");
+        return dd->comm->ddpme[0].maxshift;
     }
-
-    /* Make the local to global and global to local atom index */
-    a = dd->cgindex[cg_start];
-    for (zone = 0; zone < nzone; zone++)
+    else
     {
-        if (zone == 0)
-        {
-            cg0 = cg_start;
-        }
-        else
-        {
-            cg0 = zone2cg[zone];
-        }
-        cg1    = zone2cg[zone+1];
-        cg1_p1 = cg0 + zone_ncg1[zone];
-
-        for (cg = cg0; cg < cg1; cg++)
-        {
-            zone1 = zone;
-            if (cg >= cg1_p1)
-            {
-                /* Signal that this cg is from more than one pulse away */
-                zone1 += nzone;
-            }
-            cg_gl = index_gl[cg];
-            if (bCGs)
-            {
-                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
-                {
-                    gatindex[a] = a_gl;
-                    ga2la_set(dd->ga2la, a_gl, a, zone1);
-                    a++;
-                }
-            }
-            else
-            {
-                gatindex[a] = cg_gl;
-                ga2la_set(dd->ga2la, cg_gl, a, zone1);
-                a++;
-            }
-        }
+        return 0;
     }
 }
 
-static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
-                          const char *where)
+int dd_pme_maxshift_y(const gmx_domdec_t *dd)
 {
-    int i, ngl, nerr;
-
-    nerr = 0;
-    if (bLocalCG == nullptr)
-    {
-        return nerr;
-    }
-    for (i = 0; i < dd->ncg_tot; i++)
+    if (dd->comm->ddpme[0].dim == YY)
     {
-        if (!bLocalCG[dd->index_gl[i]])
-        {
-            fprintf(stderr,
-                    "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
-            nerr++;
-        }
+        return dd->comm->ddpme[0].maxshift;
     }
-    ngl = 0;
-    for (i = 0; i < ncg_sys; i++)
+    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
     {
-        if (bLocalCG[i])
-        {
-            ngl++;
-        }
+        return dd->comm->ddpme[1].maxshift;
     }
-    if (ngl != dd->ncg_tot)
+    else
     {
-        fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
-        nerr++;
+        return 0;
     }
-
-    return nerr;
 }
 
-static void check_index_consistency(gmx_domdec_t *dd,
-                                    int natoms_sys, int ncg_sys,
-                                    const char *where)
+static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
+                                  gmx_ddbox_t *ddbox,
+                                  rvec cell_ns_x0, rvec cell_ns_x1,
+                                  int64_t step)
 {
-    int   nerr, ngl, i, a, cell;
-    int  *have;
-
-    nerr = 0;
-
-    if (dd->comm->DD_debug > 1)
-    {
-        snew(have, natoms_sys);
-        for (a = 0; a < dd->nat_tot; a++)
-        {
-            if (have[dd->gatindex[a]] > 0)
-            {
-                fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
-            }
-            else
-            {
-                have[dd->gatindex[a]] = a + 1;
-            }
-        }
-        sfree(have);
-    }
-
-    snew(have, dd->nat_tot);
-
-    ngl  = 0;
-    for (i = 0; i < natoms_sys; i++)
-    {
-        if (ga2la_get(dd->ga2la, i, &a, &cell))
-        {
-            if (a >= dd->nat_tot)
-            {
-                fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
-                nerr++;
-            }
-            else
-            {
-                have[a] = 1;
-                if (dd->gatindex[a] != i)
-                {
-                    fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
-                    nerr++;
-                }
-            }
-            ngl++;
-        }
-    }
-    if (ngl != dd->nat_tot)
-    {
-        fprintf(stderr,
-                "DD rank %d, %s: %d global atom indices, %d local atoms\n",
-                dd->rank, where, ngl, dd->nat_tot);
-    }
-    for (a = 0; a < dd->nat_tot; a++)
-    {
-        if (have[a] == 0)
-        {
-            fprintf(stderr,
-                    "DD rank %d, %s: local atom %d, global %d has no global index\n",
-                    dd->rank, where, a+1, dd->gatindex[a]+1);
-        }
-    }
-    sfree(have);
-
-    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
-
-    if (nerr > 0)
-    {
-        gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
-                  dd->rank, where, nerr);
-    }
-}
-
-static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
-{
-    int   i;
-    char *bLocalCG;
-
-    if (a_start == 0)
-    {
-        /* Clear the whole list without searching */
-        ga2la_clear(dd->ga2la);
-    }
-    else
-    {
-        for (i = a_start; i < dd->nat_tot; i++)
-        {
-            ga2la_del(dd->ga2la, dd->gatindex[i]);
-        }
-    }
-
-    bLocalCG = dd->comm->bLocalCG;
-    if (bLocalCG)
-    {
-        for (i = cg_start; i < dd->ncg_tot; i++)
-        {
-            bLocalCG[dd->index_gl[i]] = FALSE;
-        }
-    }
-
-    dd_clear_local_vsite_indices(dd);
-
-    if (dd->constraints)
-    {
-        dd_clear_local_constraint_indices(dd);
-    }
-}
-
-/* This function should be used for moving the domain boundaries during DLB,
- * for obtaining the minimum cell size. It checks the initially set limit
- * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
- * and, possibly, a longer cut-off limit set for PME load balancing.
- */
-static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
-{
-    real cellsize_min;
-
-    cellsize_min = comm->cellsize_min[dim];
-
-    if (!comm->bVacDLBNoLimit)
-    {
-        /* The cut-off might have changed, e.g. by PME load balancing,
-         * from the value used to set comm->cellsize_min, so check it.
-         */
-        cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
-
-        if (comm->bPMELoadBalDLBLimits)
-        {
-            /* Check for the cut-off limit set by the PME load balancing */
-            cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
-        }
-    }
-
-    return cellsize_min;
-}
-
-static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
-                            int dim_ind)
-{
-    real grid_jump_limit;
-
-    /* The distance between the boundaries of cells at distance
-     * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
-     * and by the fact that cells should not be shifted by more than
-     * half their size, such that cg's only shift by one cell
-     * at redecomposition.
-     */
-    grid_jump_limit = comm->cellsize_limit;
-    if (!comm->bVacDLBNoLimit)
-    {
-        if (comm->bPMELoadBalDLBLimits)
-        {
-            cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
-        }
-        grid_jump_limit = std::max(grid_jump_limit,
-                                   cutoff/comm->cd[dim_ind].np);
-    }
-
-    return grid_jump_limit;
-}
-
-static gmx_bool check_grid_jump(gmx_int64_t     step,
-                                gmx_domdec_t   *dd,
-                                real            cutoff,
-                                gmx_ddbox_t    *ddbox,
-                                gmx_bool        bFatal)
-{
-    gmx_domdec_comm_t *comm;
-    int                d, dim;
-    real               limit, bfac;
-    gmx_bool           bInvalid;
-
-    bInvalid = FALSE;
-
-    comm = dd->comm;
-
-    for (d = 1; d < dd->ndim; d++)
-    {
-        dim   = dd->dim[d];
-        limit = grid_jump_limit(comm, cutoff, d);
-        bfac  = ddbox->box_size[dim];
-        if (ddbox->tric_dir[dim])
-        {
-            bfac *= ddbox->skew_fac[dim];
-        }
-        if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
-                                                              (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
-        {
-            bInvalid = TRUE;
-
-            if (bFatal)
-            {
-                char buf[22];
-
-                /* This error should never be triggered under normal
-                 * circumstances, but you never know ...
-                 */
-                gmx_fatal(FARGS, "step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
-                          gmx_step_str(step, buf),
-                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
-            }
-        }
-    }
-
-    return bInvalid;
-}
-
-static int dd_load_count(gmx_domdec_comm_t *comm)
-{
-    return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
-}
-
-static float dd_force_load(gmx_domdec_comm_t *comm)
-{
-    float load;
-
-    if (comm->eFlop)
-    {
-        load = comm->flop;
-        if (comm->eFlop > 1)
-        {
-            load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
-        }
-    }
-    else
-    {
-        load = comm->cycl[ddCyclF];
-        if (comm->cycl_n[ddCyclF] > 1)
-        {
-            /* Subtract the maximum of the last n cycle counts
-             * to get rid of possible high counts due to other sources,
-             * for instance system activity, that would otherwise
-             * affect the dynamic load balancing.
-             */
-            load -= comm->cycl_max[ddCyclF];
-        }
-
-#if GMX_MPI
-        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
-        {
-            float gpu_wait, gpu_wait_sum;
-
-            gpu_wait = comm->cycl[ddCyclWaitGPU];
-            if (comm->cycl_n[ddCyclF] > 1)
-            {
-                /* We should remove the WaitGPU time of the same MD step
-                 * as the one with the maximum F time, since the F time
-                 * and the wait time are not independent.
-                 * Furthermore, the step for the max F time should be chosen
-                 * the same on all ranks that share the same GPU.
-                 * But to keep the code simple, we remove the average instead.
-                 * The main reason for artificially long times at some steps
-                 * is spurious CPU activity or MPI time, so we don't expect
-                 * that changes in the GPU wait time matter a lot here.
-                 */
-                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
-            }
-            /* Sum the wait times over the ranks that share the same GPU */
-            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
-                          comm->mpi_comm_gpu_shared);
-            /* Replace the wait time by the average over the ranks */
-            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
-        }
-#endif
-    }
-
-    return load;
-}
-
-static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
-{
-    gmx_domdec_comm_t *comm;
-    int                i;
-
-    comm = dd->comm;
-
-    snew(*dim_f, dd->nc[dim]+1);
-    (*dim_f)[0] = 0;
-    for (i = 1; i < dd->nc[dim]; i++)
-    {
-        if (comm->slb_frac[dim])
-        {
-            (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
-        }
-        else
-        {
-            (*dim_f)[i] = (real)i/(real)dd->nc[dim];
-        }
-    }
-    (*dim_f)[dd->nc[dim]] = 1;
-}
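-
-/* Illustrative example (hypothetical numbers): with dd->nc[dim] = 4 and
- * slb_frac[dim] = {0.3, 0.2, 0.25, 0.25}, the cumulative boundaries become
- * dim_f = {0, 0.3, 0.5, 0.75, 1}; without slb_frac the grid is uniform and
- * dim_f = {0, 0.25, 0.5, 0.75, 1}.
- */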
-
-static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
-{
-    int  pmeindex, slab, nso, i;
-    ivec xyz;
-
-    if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
-    {
-        ddpme->dim = YY;
-    }
-    else
-    {
-        ddpme->dim = dimind;
-    }
-    ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
-
-    ddpme->nslab = (ddpme->dim == 0 ?
-                    dd->comm->npmenodes_x :
-                    dd->comm->npmenodes_y);
-
-    if (ddpme->nslab <= 1)
-    {
-        return;
-    }
-
-    nso = dd->comm->npmenodes/ddpme->nslab;
-    /* Determine for each PME slab the PP location range for dimension dim */
-    snew(ddpme->pp_min, ddpme->nslab);
-    snew(ddpme->pp_max, ddpme->nslab);
-    for (slab = 0; slab < ddpme->nslab; slab++)
-    {
-        ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
-        ddpme->pp_max[slab] = 0;
-    }
-    for (i = 0; i < dd->nnodes; i++)
-    {
-        ddindex2xyz(dd->nc, i, xyz);
-        /* For y only use our y/z slab.
-         * This assumes that the PME x grid size matches the DD grid size.
-         */
-        if (dimind == 0 || xyz[XX] == dd->ci[XX])
-        {
-            pmeindex = ddindex2pmeindex(dd, i);
-            if (dimind == 0)
-            {
-                slab = pmeindex/nso;
-            }
-            else
-            {
-                slab = pmeindex % ddpme->nslab;
-            }
-            ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
-            ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
-        }
-    }
-
-    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
-}
-
-int dd_pme_maxshift_x(const gmx_domdec_t *dd)
-{
-    if (dd->comm->ddpme[0].dim == XX)
-    {
-        return dd->comm->ddpme[0].maxshift;
-    }
-    else
-    {
-        return 0;
-    }
-}
-
-int dd_pme_maxshift_y(const gmx_domdec_t *dd)
-{
-    if (dd->comm->ddpme[0].dim == YY)
-    {
-        return dd->comm->ddpme[0].maxshift;
-    }
-    else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
-    {
-        return dd->comm->ddpme[1].maxshift;
-    }
-    else
-    {
-        return 0;
-    }
-}
-
-static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
-                             gmx_bool bUniform, const gmx_ddbox_t *ddbox,
-                             const real *cell_f)
-{
-    gmx_domdec_comm_t *comm;
-    int                nc, ns, s;
-    int               *xmin, *xmax;
-    real               range, pme_boundary;
-    int                sh;
-
-    comm = dd->comm;
-    nc   = dd->nc[ddpme->dim];
-    ns   = ddpme->nslab;
-
-    if (!ddpme->dim_match)
-    {
-        /* PP decomposition is not along dim: the worst situation */
-        sh = ns/2;
-    }
-    else if (ns <= 3 || (bUniform && ns == nc))
-    {
-        /* The optimal situation */
-        sh = 1;
-    }
-    else
-    {
-        /* We need to check, for each PME node, which nodes it
-         * could possibly need to communicate with.
-         */
-        xmin = ddpme->pp_min;
-        xmax = ddpme->pp_max;
-        /* Allow atoms to be at most 2/3 times the cut-off
-         * out of their DD cell. This is a reasonable balance
-         * between performance and support for most charge-group/cut-off
-         * combinations.
-         */
-        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
-        /* Avoid extra communication when we are exactly at a boundary */
-        range *= 0.999;
-
-        sh = 1;
-        for (s = 0; s < ns; s++)
-        {
-            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
-            pme_boundary = (real)s/ns;
-            while (sh+1 < ns &&
-                   ((s-(sh+1) >= 0 &&
-                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
-                    (s-(sh+1) <  0 &&
-                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
-            {
-                sh++;
-            }
-            pme_boundary = (real)(s+1)/ns;
-            while (sh+1 < ns &&
-                   ((s+(sh+1) <  ns &&
-                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
-                    (s+(sh+1) >= ns &&
-                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
-            {
-                sh++;
-            }
-        }
-    }
-
-    ddpme->maxshift = sh;
-
-    if (debug)
-    {
-        fprintf(debug, "PME slab communication range for dim %d is %d\n",
-                ddpme->dim, ddpme->maxshift);
-    }
-}
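-
-/* Rough sketch of the meaning of maxshift (hypothetical case): PME slab s
- * spreads charge for box fractions [s/ns, (s+1)/ns). maxshift = sh bounds,
- * in slab units, how far the PP cells contributing atoms to a slab can
- * reach, i.e. with how many neighboring PME slabs coordinates may have to
- * be exchanged. The optimal case (ns <= 3, or a uniform grid with ns == nc)
- * gives sh = 1; when the PP decomposition is not along this dimension the
- * worst case sh = ns/2 is used.
- */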
-
-static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
-{
-    int d, dim;
-
-    for (d = 0; d < dd->ndim; d++)
-    {
-        dim = dd->dim[d];
-        if (dim < ddbox->nboundeddim &&
-            ddbox->box_size[dim]*ddbox->skew_fac[dim] <
-            dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
-        {
-            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
-                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
-                      dd->nc[dim], dd->comm->cellsize_limit);
-        }
-    }
-}
-
-enum {
-    setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
-};
-
-/* Set the domain boundaries. Use for static (or no) load balancing,
- * and also for the starting state for dynamic load balancing.
- * setmode determines if and where the boundaries are stored,
- * see the enum above.
- * Returns the number of communication pulses in npulse.
- */
-static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, const gmx_ddbox_t *ddbox,
-                                  int setmode, ivec npulse)
-{
-    gmx_domdec_comm_t *comm;
-    int                d, j;
-    rvec               cellsize_min;
-    real              *cell_x, cell_dx, cellsize;
-
-    comm = dd->comm;
-
-    for (d = 0; d < DIM; d++)
-    {
-        cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
-        npulse[d]       = 1;
-        if (dd->nc[d] == 1 || comm->slb_frac[d] == nullptr)
-        {
-            /* Uniform grid */
-            cell_dx = ddbox->box_size[d]/dd->nc[d];
-            switch (setmode)
-            {
-                case setcellsizeslbMASTER:
-                    for (j = 0; j < dd->nc[d]+1; j++)
-                    {
-                        dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
-                    }
-                    break;
-                case setcellsizeslbLOCAL:
-                    comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
-                    comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
-                    break;
-                default:
-                    break;
-            }
-            cellsize = cell_dx*ddbox->skew_fac[d];
-            while (cellsize*npulse[d] < comm->cutoff)
-            {
-                npulse[d]++;
-            }
-            cellsize_min[d] = cellsize;
-        }
-        else
-        {
-            /* Statically load balanced grid */
-            /* Even when we are not doing a master distribution, we determine
-             * all cell borders in a loop to obtain values identical
-             * to the master-distribution case and to determine npulse.
-             */
-            if (setmode == setcellsizeslbMASTER)
-            {
-                cell_x = dd->ma->cell_x[d];
-            }
-            else
-            {
-                snew(cell_x, dd->nc[d]+1);
-            }
-            cell_x[0] = ddbox->box0[d];
-            for (j = 0; j < dd->nc[d]; j++)
-            {
-                cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
-                cell_x[j+1] = cell_x[j] + cell_dx;
-                cellsize    = cell_dx*ddbox->skew_fac[d];
-                while (cellsize*npulse[d] < comm->cutoff &&
-                       npulse[d] < dd->nc[d]-1)
-                {
-                    npulse[d]++;
-                }
-                cellsize_min[d] = std::min(cellsize_min[d], cellsize);
-            }
-            if (setmode == setcellsizeslbLOCAL)
-            {
-                comm->cell_x0[d] = cell_x[dd->ci[d]];
-                comm->cell_x1[d] = cell_x[dd->ci[d]+1];
-            }
-            if (setmode != setcellsizeslbMASTER)
-            {
-                sfree(cell_x);
-            }
-        }
-        /* The following limitation avoids a cell receiving some of its own
-         * home charge groups back over the periodic boundary.
-         * Duplicate charge groups cause trouble with the global indices.
-         */
-        if (d < ddbox->npbcdim &&
-            dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
-        {
-            char error_string[STRLEN];
-
-            sprintf(error_string,
-                    "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
-                    dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
-                    comm->cutoff,
-                    dd->nc[d], dd->nc[d],
-                    dd->nnodes > dd->nc[d] ? "cells" : "ranks");
-
-            if (setmode == setcellsizeslbLOCAL)
-            {
-                gmx_fatal_collective(FARGS, dd->mpi_comm_all, DDMASTER(dd),
-                                     error_string);
-            }
-            else
-            {
-                gmx_fatal(FARGS, error_string);
-            }
-        }
-    }
-
-    if (!isDlbOn(comm))
-    {
-        copy_rvec(cellsize_min, comm->cellsize_min);
-    }
-
-    for (d = 0; d < comm->npmedecompdim; d++)
-    {
-        set_pme_maxshift(dd, &comm->ddpme[d],
-                         comm->slb_frac[dd->dim[d]] == nullptr, ddbox,
-                         comm->ddpme[d].slb_dim_f);
-    }
-}
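-
-/* Illustrative npulse example (hypothetical numbers): npulse[d] is the
- * smallest number of communication pulses for which
- * cellsize*npulse >= cutoff holds. With a cell size of 1.2 nm along a
- * dimension and a 2.0 nm cut-off, one pulse is not enough (1.2 < 2.0)
- * but two are (2.4 >= 2.0), so npulse[d] = 2.
- */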
-
-
-static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
-                                                  int d, int dim, domdec_root_t *root,
-                                                  const gmx_ddbox_t *ddbox,
-                                                  gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
-{
-    gmx_domdec_comm_t *comm;
-    int                ncd, i, j, nmin, nmin_old;
-    gmx_bool           bLimLo, bLimHi;
-    real              *cell_size;
-    real               fac, halfway, cellsize_limit_f_i, region_size;
-    gmx_bool           bPBC, bLastHi = FALSE;
-    int                nrange[] = {range[0], range[1]};
-
-    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
-
-    comm = dd->comm;
-
-    ncd = dd->nc[dim];
-
-    bPBC = (dim < ddbox->npbcdim);
-
-    cell_size = root->buf_ncd;
-
-    if (debug)
-    {
-        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
-    }
-
-    /* First we need to check that the scaling does not make any cell
-     * smaller than the smallest allowed size.
-     * We need to do this iteratively: if a cell is too small,
-     * it needs to be enlarged, which makes all the other cells smaller,
-     * which could in turn make another cell smaller than allowed.
-     */
-    for (i = range[0]; i < range[1]; i++)
-    {
-        root->bCellMin[i] = FALSE;
-    }
-    nmin = 0;
-    do
-    {
-        nmin_old = nmin;
-        /* We need the total for normalization */
-        fac = 0;
-        for (i = range[0]; i < range[1]; i++)
-        {
-            if (root->bCellMin[i] == FALSE)
-            {
-                fac += cell_size[i];
-            }
-        }
-        fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
-        /* Determine the cell boundaries */
-        for (i = range[0]; i < range[1]; i++)
-        {
-            if (root->bCellMin[i] == FALSE)
-            {
-                cell_size[i] *= fac;
-                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
-                {
-                    cellsize_limit_f_i = 0;
-                }
-                else
-                {
-                    cellsize_limit_f_i = cellsize_limit_f;
-                }
-                if (cell_size[i] < cellsize_limit_f_i)
-                {
-                    root->bCellMin[i] = TRUE;
-                    cell_size[i]      = cellsize_limit_f_i;
-                    nmin++;
-                }
-            }
-            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
-        }
-    }
-    while (nmin > nmin_old);
-
-    i            = range[1]-1;
-    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
-    /* For this check we should not use DD_CELL_MARGIN,
-     * but a slightly smaller factor,
-     * since rounding could get us below the limit.
-     */
-    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
-    {
-        char buf[22];
-        gmx_fatal(FARGS, "step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
-                  gmx_step_str(step, buf),
-                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
-                  ncd, comm->cellsize_min[dim]);
-    }
-
-    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
-
-    if (!bUniform)
-    {
-        /* Check that no boundary has moved more than halfway across
-         * either of the cells it bounds, as this could cause problems,
-         * especially when the differences between cell sizes are large.
-         * If changes are applied, they will not make cells smaller
-         * than the cut-off, as we check all the boundaries which
-         * might be affected by a change, and if the old state was ok,
-         * the cells will at most be shrunk back to their old size.
-         */
-        for (i = range[0]+1; i < range[1]; i++)
-        {
-            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
-            if (root->cell_f[i] < halfway)
-            {
-                root->cell_f[i] = halfway;
-                /* Check if the change also causes shifts of the next boundaries */
-                for (j = i+1; j < range[1]; j++)
-                {
-                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
-                    {
-                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
-                    }
-                }
-            }
-            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
-            if (root->cell_f[i] > halfway)
-            {
-                root->cell_f[i] = halfway;
-                /* Check if the change also causes shifts of the next boundaries */
-                for (j = i-1; j >= range[0]+1; j--)
-                {
-                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
-                    {
-                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
-                    }
-                }
-            }
-        }
-    }
-
-    /* nrange is defined as the [lower, upper) range for a new call to enforce_limits */
-    /* Find the highest violation of LimLo (a) and the lowest following violation
-     * of LimHi (b), then call enforce_limits for (oldb,a) and (a,b). In the next
-     * step: (b,nexta). oldb and nexta can be the range boundaries; a and b are
-     * passed via nrange. */
-    if (d > 0)
-    {
-        /* Take care of the staggering of the cell boundaries */
-        if (bUniform)
-        {
-            for (i = range[0]; i < range[1]; i++)
-            {
-                root->cell_f_max0[i] = root->cell_f[i];
-                root->cell_f_min1[i] = root->cell_f[i+1];
-            }
-        }
-        else
-        {
-            for (i = range[0]+1; i < range[1]; i++)
-            {
-                bLimLo = (root->cell_f[i] < root->bound_min[i]);
-                bLimHi = (root->cell_f[i] > root->bound_max[i]);
-                if (bLimLo && bLimHi)
-                {
-                    /* Both limits violated, try the best we can */
-                    /* In this case we split the original range (range) in two parts and deal with the other limitations in the next iteration. */
-                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
-                    nrange[0]       = range[0];
-                    nrange[1]       = i;
-                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-
-                    nrange[0] = i;
-                    nrange[1] = range[1];
-                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-
-                    return;
-                }
-                else if (bLimLo)
-                {
-                    /* root->cell_f[i] = root->bound_min[i]; */
-                    nrange[1] = i;  /* Only store the violation location; there could be another LimLo violation at a higher index */
-                    bLastHi   = FALSE;
-                }
-                else if (bLimHi && !bLastHi)
-                {
-                    bLastHi = TRUE;
-                    if (nrange[1] < range[1])   /* found a LimLo before */
-                    {
-                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
-                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-                        nrange[0] = nrange[1];
-                    }
-                    root->cell_f[i] = root->bound_max[i];
-                    nrange[1]       = i;
-                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-                    nrange[0] = i;
-                    nrange[1] = range[1];
-                }
-            }
-            if (nrange[1] < range[1])   /* the last violation found was a LimLo */
-            {
-                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
-                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-                nrange[0] = nrange[1];
-                nrange[1] = range[1];
-                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-            }
-            else if (nrange[0] > range[0]) /* found at least one LimHi */
-            {
-                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-            }
-        }
-    }
-}
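-
-/* Worked example of the iterative minimum-size enforcement above
- * (hypothetical numbers): four cells with relative sizes
- * {0.40, 0.30, 0.20, 0.10}, cellsize_limit_f = 0.15 and a region of size 1.
- * In the first pass cell 3 (0.10) falls below the limit and is pinned to
- * 0.15. In the second pass the remaining cells are rescaled by
- * fac = (1 - 0.15)/0.90 = 0.944..., giving {0.378, 0.283, 0.189, 0.15},
- * which all satisfy the limit, so the loop terminates.
- */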
-
-
-static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
-                                       int d, int dim, domdec_root_t *root,
-                                       const gmx_ddbox_t *ddbox,
-                                       gmx_bool bDynamicBox,
-                                       gmx_bool bUniform, gmx_int64_t step)
-{
-    gmx_domdec_comm_t *comm;
-    int                ncd, d1, i, pos;
-    real              *cell_size;
-    real               load_aver, load_i, imbalance, change, change_max, sc;
-    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
-    real               change_limit;
-    real               relax = 0.5;
-    gmx_bool           bPBC;
-    int                range[] = { 0, 0 };
-
-    comm = dd->comm;
-
-    /* Convert the maximum change from the input percentage to a fraction */
-    change_limit = comm->dlb_scale_lim*0.01;
-
-    ncd = dd->nc[dim];
-
-    bPBC = (dim < ddbox->npbcdim);
-
-    cell_size = root->buf_ncd;
-
-    /* Store the original boundaries */
-    for (i = 0; i < ncd+1; i++)
-    {
-        root->old_cell_f[i] = root->cell_f[i];
-    }
-    if (bUniform)
-    {
-        for (i = 0; i < ncd; i++)
-        {
-            cell_size[i] = 1.0/ncd;
-        }
-    }
-    else if (dd_load_count(comm) > 0)
-    {
-        load_aver  = comm->load[d].sum_m/ncd;
-        change_max = 0;
-        for (i = 0; i < ncd; i++)
-        {
-            /* Determine the relative imbalance of cell i */
-            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
-            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
-            /* Determine the change of the cell size using underrelaxation */
-            change     = -relax*imbalance;
-            change_max = std::max(change_max, std::max(change, -change));
-        }
-        /* Limit the amount of scaling.
-         * We need to use the same rescaling for all cells in one row,
-         * otherwise the load balancing might not converge.
-         */
-        sc = relax;
-        if (change_max > change_limit)
-        {
-            sc *= change_limit/change_max;
-        }
-        for (i = 0; i < ncd; i++)
-        {
-            /* Determine the relative imbalance of cell i */
-            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
-            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
-            /* Determine the change of the cell size using underrelaxation */
-            change       = -sc*imbalance;
-            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
-        }
-    }
-
-    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
-    cellsize_limit_f *= DD_CELL_MARGIN;
-    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
-    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
-    if (ddbox->tric_dir[dim])
-    {
-        cellsize_limit_f /= ddbox->skew_fac[dim];
-        dist_min_f       /= ddbox->skew_fac[dim];
-    }
-    if (bDynamicBox && d > 0)
-    {
-        dist_min_f *= DD_PRES_SCALE_MARGIN;
-    }
-    if (d > 0 && !bUniform)
-    {
-        /* Make sure that the grid is not shifted too much */
-        for (i = 1; i < ncd; i++)
-        {
-            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
-            {
-                gmx_incons("Inconsistent DD boundary staggering limits!");
-            }
-            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
-            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
-            if (space > 0)
-            {
-                root->bound_min[i] += 0.5*space;
-            }
-            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
-            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
-            if (space < 0)
-            {
-                root->bound_max[i] += 0.5*space;
-            }
-            if (debug)
-            {
-                fprintf(debug,
-                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
-                        d, i,
-                        root->cell_f_max0[i-1] + dist_min_f,
-                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
-                        root->cell_f_min1[i] - dist_min_f);
-            }
-        }
-    }
-    range[1]          = ncd;
-    root->cell_f[0]   = 0;
-    root->cell_f[ncd] = 1;
-    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
-
-
-    /* After the checks above, the cells should obey the cut-off
-     * restrictions, but it does not hurt to check.
-     */
-    for (i = 0; i < ncd; i++)
-    {
-        if (debug)
-        {
-            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
-                    dim, i, root->cell_f[i], root->cell_f[i+1]);
-        }
-
-        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
-            root->cell_f[i+1] - root->cell_f[i] <
-            cellsize_limit_f/DD_CELL_MARGIN)
-        {
-            char buf[22];
-            fprintf(stderr,
-                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
-                    gmx_step_str(step, buf), dim2char(dim), i,
-                    (root->cell_f[i+1] - root->cell_f[i])
-                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
-        }
-    }
-
-    pos = ncd + 1;
-    /* Store the cell boundaries of the lower dimensions at the end */
-    for (d1 = 0; d1 < d; d1++)
-    {
-        root->cell_f[pos++] = comm->cell_f0[d1];
-        root->cell_f[pos++] = comm->cell_f1[d1];
-    }
-
-    if (d < comm->npmedecompdim)
-    {
-        /* The master determines the maximum shift for
-         * the coordinate communication between separate PME nodes.
-         */
-        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
-    }
-    root->cell_f[pos++] = comm->ddpme[0].maxshift;
-    if (d >= 1)
-    {
-        root->cell_f[pos++] = comm->ddpme[1].maxshift;
-    }
-}
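-
-/* Illustrative under-relaxation example (hypothetical numbers): with
- * relax = 0.5, a row-average load of 100 and a cell load of 120, the
- * imbalance is 0.2 and the proposed change is -0.5*0.2 = -0.1, i.e. the
- * cell shrinks by 10% (further capped by dlb_scale_lim via change_max).
- */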
-
-static void relative_to_absolute_cell_bounds(gmx_domdec_t      *dd,
-                                             const gmx_ddbox_t *ddbox,
-                                             int                dimind)
-{
-    gmx_domdec_comm_t *comm;
-    int                dim;
-
-    comm = dd->comm;
-
-    /* Set the cell dimensions */
-    dim                = dd->dim[dimind];
-    comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
-    comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
-    if (dim >= ddbox->nboundeddim)
-    {
-        comm->cell_x0[dim] += ddbox->box0[dim];
-        comm->cell_x1[dim] += ddbox->box0[dim];
-    }
-}
-
-static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
-                                         int d, int dim, real *cell_f_row,
-                                         const gmx_ddbox_t *ddbox)
-{
-    gmx_domdec_comm_t *comm;
-    int                d1, pos;
-
-    comm = dd->comm;
-
-#if GMX_MPI
-    /* Each node would only need to know two fractions,
-     * but it is probably cheaper to broadcast the whole array.
-     */
-    MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
-              0, comm->mpi_comm_load[d]);
-#endif
-    /* Copy the fractions for this dimension from the buffer */
-    comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
-    comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
-    /* The whole array was communicated, so set the buffer position */
-    pos = dd->nc[dim] + 1;
-    for (d1 = 0; d1 <= d; d1++)
-    {
-        if (d1 < d)
-        {
-            /* Copy the cell fractions of the lower dimensions */
-            comm->cell_f0[d1] = cell_f_row[pos++];
-            comm->cell_f1[d1] = cell_f_row[pos++];
-        }
-        relative_to_absolute_cell_bounds(dd, ddbox, d1);
-    }
-    /* Convert the communicated shift from real to int */
-    comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
-    if (d >= 1)
-    {
-        comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
-    }
-}
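-
-/* Layout of cell_f_row as written by set_dd_cell_sizes_dlb_root and read
- * above, illustrated for d = 1 with dd->nc[dim] = 4 (hypothetical case):
- *   [0..4]  cell fractions cell_f[0..nc]
- *   [5..6]  cell_f0/cell_f1 of the lower dimension (d1 = 0)
- *   [7]     ddpme[0].maxshift (stored as real)
- *   [8]     ddpme[1].maxshift (only present for d >= 1)
- */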
-
-static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
-                                         const gmx_ddbox_t *ddbox,
-                                         gmx_bool bDynamicBox,
-                                         gmx_bool bUniform, gmx_int64_t step)
-{
-    gmx_domdec_comm_t *comm;
-    int                d, dim, d1;
-    gmx_bool           bRowMember, bRowRoot;
-    real              *cell_f_row;
-
-    comm = dd->comm;
-
-    for (d = 0; d < dd->ndim; d++)
-    {
-        dim        = dd->dim[d];
-        bRowMember = TRUE;
-        bRowRoot   = TRUE;
-        for (d1 = d; d1 < dd->ndim; d1++)
-        {
-            if (dd->ci[dd->dim[d1]] > 0)
-            {
-                if (d1 != d)
-                {
-                    bRowMember = FALSE;
-                }
-                bRowRoot = FALSE;
-            }
-        }
-        if (bRowMember)
-        {
-            if (bRowRoot)
-            {
-                set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
-                                           ddbox, bDynamicBox, bUniform, step);
-                cell_f_row = comm->root[d]->cell_f;
-            }
-            else
-            {
-                cell_f_row = comm->cell_f_row;
-            }
-            distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
-        }
-    }
-}
-
-static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t      *dd,
-                                           const gmx_ddbox_t *ddbox)
-{
-    int d;
-
-    /* This function assumes the box is static and should therefore
-     * not be called when the box has changed since the last
-     * call to dd_partition_system.
-     */
-    for (d = 0; d < dd->ndim; d++)
-    {
-        relative_to_absolute_cell_bounds(dd, ddbox, d);
-    }
-}
-
-
-
-static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
-                                  const gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
-                                  gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
-                                  gmx_wallcycle_t wcycle)
-{
-    gmx_domdec_comm_t *comm;
-    int                dim;
-
-    comm = dd->comm;
-
-    if (bDoDLB)
-    {
-        wallcycle_start(wcycle, ewcDDCOMMBOUND);
-        set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
-        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
-    }
-    else if (bDynamicBox)
-    {
-        set_dd_cell_sizes_dlb_nochange(dd, ddbox);
-    }
-
-    /* Set the dimensions for which no DD is used */
-    for (dim = 0; dim < DIM; dim++)
-    {
-        if (dd->nc[dim] == 1)
-        {
-            comm->cell_x0[dim] = 0;
-            comm->cell_x1[dim] = ddbox->box_size[dim];
-            if (dim >= ddbox->nboundeddim)
-            {
-                comm->cell_x0[dim] += ddbox->box0[dim];
-                comm->cell_x1[dim] += ddbox->box0[dim];
-            }
-        }
-    }
-}
-
-static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
-{
-    int                    d, np, i;
-    gmx_domdec_comm_dim_t *cd;
-
-    for (d = 0; d < dd->ndim; d++)
-    {
-        cd = &dd->comm->cd[d];
-        np = npulse[dd->dim[d]];
-        if (np > cd->np_nalloc)
-        {
-            if (debug)
-            {
-                fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
-                        dim2char(dd->dim[d]), np);
-            }
-            if (DDMASTER(dd) && cd->np_nalloc > 0)
-            {
-                fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
-            }
-            srenew(cd->ind, np);
-            for (i = cd->np_nalloc; i < np; i++)
-            {
-                cd->ind[i].index  = nullptr;
-                cd->ind[i].nalloc = 0;
-            }
-            cd->np_nalloc = np;
-        }
-        cd->np = np;
-    }
-}
-
-
-static void set_dd_cell_sizes(gmx_domdec_t *dd,
-                              gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
-                              gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
-                              gmx_wallcycle_t wcycle)
-{
-    gmx_domdec_comm_t *comm;
-    int                d;
-    ivec               npulse;
-
-    comm = dd->comm;
-
-    /* Copy the old cell boundaries for the cg displacement check */
-    copy_rvec(comm->cell_x0, comm->old_cell_x0);
-    copy_rvec(comm->cell_x1, comm->old_cell_x1);
-
-    if (isDlbOn(comm))
-    {
-        if (DDMASTER(dd))
-        {
-            check_box_size(dd, ddbox);
-        }
-        set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
-    }
-    else
-    {
-        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
-        realloc_comm_ind(dd, npulse);
-    }
-
-    if (debug)
-    {
-        for (d = 0; d < DIM; d++)
-        {
-            fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
-                    d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
-        }
-    }
-}
-
-static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
-                                  gmx_ddbox_t *ddbox,
-                                  rvec cell_ns_x0, rvec cell_ns_x1,
-                                  gmx_int64_t step)
-{
-    gmx_domdec_comm_t *comm;
-    int                dim_ind, dim;
-
-    comm = dd->comm;
-
-    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
-    {
-        dim = dd->dim[dim_ind];
-
-        /* Without PBC we don't have restrictions on the outer cells */
-        if (!(dim >= ddbox->npbcdim &&
-              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
-            isDlbOn(comm) &&
-            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
-            comm->cellsize_min[dim])
-        {
-            char buf[22];
-            gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
-                      gmx_step_str(step, buf), dim2char(dim),
-                      comm->cell_x1[dim] - comm->cell_x0[dim],
-                      ddbox->skew_fac[dim],
-                      dd->comm->cellsize_min[dim],
-                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
-        }
-    }
-
-    if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
-    {
-        /* Communicate the boundaries and update cell_ns_x0/1 */
-        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
-        if (isDlbOn(dd->comm) && dd->ndim > 1)
-        {
-            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
-        }
-    }
-}
-
-static void make_tric_corr_matrix(int npbcdim, const matrix box, matrix tcm)
-{
-    if (YY < npbcdim)
-    {
-        tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
-    }
-    else
-    {
-        tcm[YY][XX] = 0;
-    }
-    if (ZZ < npbcdim)
-    {
-        tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
-        tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
-    }
-    else
-    {
-        tcm[ZZ][XX] = 0;
-        tcm[ZZ][YY] = 0;
-    }
-}
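-
-/* Worked example of the triclinic correction matrix (hypothetical box):
- * for box rows (6,0,0), (2,6,0), (0,2,6) we get
- *   tcm[YY][XX] = -2/6 = -1/3,
- *   tcm[ZZ][XX] = -(2*(-1/3) + 0)/6 = 1/9,
- *   tcm[ZZ][YY] = -2/6 = -1/3.
- * Adding cg_cm[j]*tcm[j][d] to coordinate d removes the tilt contribution
- * of the higher box vectors: a point displaced by the second or third box
- * vector then gets the same effective x (and y) as the origin, so the cell
- * index along each dimension can be found with simple interval checks.
- */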
-
-static void check_screw_box(const matrix box)
-{
-    /* Mathematical limitation */
-    if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
-    {
-        gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
-    }
-
-    /* Limitation due to the asymmetry of the eighth shell method */
-    if (box[ZZ][YY] != 0)
-    {
-        gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
-    }
-}
-
-static void distribute_cg(FILE *fplog,
-                          const matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
-                          gmx_domdec_t *dd)
-{
-    gmx_domdec_master_t *ma;
-    int                **tmp_ind = nullptr, *tmp_nalloc = nullptr;
-    int                  i, icg, j, k, k0, k1, d;
-    matrix               tcm;
-    rvec                 cg_cm;
-    ivec                 ind;
-    real                 nrcg, inv_ncg, pos_d;
-    int                 *cgindex;
-    gmx_bool             bScrew;
-
-    ma = dd->ma;
-
-    snew(tmp_nalloc, dd->nnodes);
-    snew(tmp_ind, dd->nnodes);
-    for (i = 0; i < dd->nnodes; i++)
-    {
-        tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
-        snew(tmp_ind[i], tmp_nalloc[i]);
-    }
-
-    /* Clear the count */
-    for (i = 0; i < dd->nnodes; i++)
-    {
-        ma->ncg[i] = 0;
-        ma->nat[i] = 0;
-    }
-
-    make_tric_corr_matrix(dd->npbcdim, box, tcm);
-
-    cgindex = cgs->index;
-
-    /* Compute the center of geometry for all charge groups */
-    for (icg = 0; icg < cgs->nr; icg++)
-    {
-        k0      = cgindex[icg];
-        k1      = cgindex[icg+1];
-        nrcg    = k1 - k0;
-        if (nrcg == 1)
-        {
-            copy_rvec(pos[k0], cg_cm);
-        }
-        else
-        {
-            inv_ncg = 1.0/nrcg;
-
-            clear_rvec(cg_cm);
-            for (k = k0; (k < k1); k++)
-            {
-                rvec_inc(cg_cm, pos[k]);
-            }
-            for (d = 0; (d < DIM); d++)
-            {
-                cg_cm[d] *= inv_ncg;
-            }
-        }
-        /* Put the charge group in the box and determine the cell index */
-        for (d = DIM-1; d >= 0; d--)
-        {
-            pos_d = cg_cm[d];
-            if (d < dd->npbcdim)
-            {
-                bScrew = (dd->bScrewPBC && d == XX);
-                if (tric_dir[d] && dd->nc[d] > 1)
-                {
-                    /* Use triclinic coordinates for this dimension */
-                    for (j = d+1; j < DIM; j++)
-                    {
-                        pos_d += cg_cm[j]*tcm[j][d];
-                    }
-                }
-                while (pos_d >= box[d][d])
-                {
-                    pos_d -= box[d][d];
-                    rvec_dec(cg_cm, box[d]);
-                    if (bScrew)
-                    {
-                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
-                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
-                    }
-                    for (k = k0; (k < k1); k++)
-                    {
-                        rvec_dec(pos[k], box[d]);
-                        if (bScrew)
-                        {
-                            pos[k][YY] = box[YY][YY] - pos[k][YY];
-                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
-                        }
-                    }
-                }
-                while (pos_d < 0)
-                {
-                    pos_d += box[d][d];
-                    rvec_inc(cg_cm, box[d]);
-                    if (bScrew)
-                    {
-                        cg_cm[YY] = box[YY][YY] - cg_cm[YY];
-                        cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
-                    }
-                    for (k = k0; (k < k1); k++)
-                    {
-                        rvec_inc(pos[k], box[d]);
-                        if (bScrew)
-                        {
-                            pos[k][YY] = box[YY][YY] - pos[k][YY];
-                            pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
-                        }
-                    }
-                }
-            }
-            /* This could be done more efficiently */
-            ind[d] = 0;
-            while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
-            {
-                ind[d]++;
-            }
-        }
-        i = dd_index(dd->nc, ind);
-        if (ma->ncg[i] == tmp_nalloc[i])
-        {
-            tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
-            srenew(tmp_ind[i], tmp_nalloc[i]);
-        }
-        tmp_ind[i][ma->ncg[i]] = icg;
-        ma->ncg[i]++;
-        ma->nat[i] += cgindex[icg+1] - cgindex[icg];
-    }
-
-    k1 = 0;
-    for (i = 0; i < dd->nnodes; i++)
-    {
-        ma->index[i] = k1;
-        for (k = 0; k < ma->ncg[i]; k++)
-        {
-            ma->cg[k1++] = tmp_ind[i][k];
-        }
-    }
-    ma->index[dd->nnodes] = k1;
-
-    for (i = 0; i < dd->nnodes; i++)
-    {
-        sfree(tmp_ind[i]);
-    }
-    sfree(tmp_ind);
-    sfree(tmp_nalloc);
-
-    if (fplog)
-    {
-        // Use double for the sums to avoid natoms^2 overflowing
-        // (65537^2 > 2^32)
-        int    nat_sum, nat_min, nat_max;
-        double nat2_sum;
-
-        nat_sum  = 0;
-        nat2_sum = 0;
-        nat_min  = ma->nat[0];
-        nat_max  = ma->nat[0];
-        for (i = 0; i < dd->nnodes; i++)
-        {
-            nat_sum  += ma->nat[i];
-            // cast to double to avoid integer overflows when squaring
-            nat2_sum += gmx::square(static_cast<double>(ma->nat[i]));
-            nat_min   = std::min(nat_min, ma->nat[i]);
-            nat_max   = std::max(nat_max, ma->nat[i]);
-        }
-        nat_sum  /= dd->nnodes;
-        nat2_sum /= dd->nnodes;
-
-        fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
-                dd->nnodes,
-                nat_sum,
-                static_cast<int>(std::sqrt(nat2_sum - gmx::square(static_cast<double>(nat_sum)) + 0.5)),
-                nat_min, nat_max);
-    }
-}
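-
-/* The statistics above use the identity stddev = sqrt(<n^2> - <n>^2)
- * (hypothetical numbers): for per-domain atom counts {90, 110},
- * <n> = 100 and <n^2> = 10100, so the reported stddev is sqrt(100) = 10.
- */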
-
-static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
-                                t_block *cgs, const matrix box, gmx_ddbox_t *ddbox,
-                                rvec pos[])
-{
-    gmx_domdec_master_t *ma = nullptr;
-    ivec                 npulse;
-    int                  i, cg_gl;
-    int                 *ibuf, buf2[2] = { 0, 0 };
-    gmx_bool             bMaster = DDMASTER(dd);
-
-    if (bMaster)
-    {
-        ma = dd->ma;
-
-        if (dd->bScrewPBC)
-        {
-            check_screw_box(box);
-        }
-
-        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
-
-        distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
-        for (i = 0; i < dd->nnodes; i++)
-        {
-            ma->ibuf[2*i]   = ma->ncg[i];
-            ma->ibuf[2*i+1] = ma->nat[i];
-        }
-        ibuf = ma->ibuf;
-    }
-    else
-    {
-        ibuf = nullptr;
-    }
-    dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
-
-    dd->ncg_home = buf2[0];
-    dd->nat_home = buf2[1];
-    dd->ncg_tot  = dd->ncg_home;
-    dd->nat_tot  = dd->nat_home;
-    if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
-    {
-        dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
-        srenew(dd->index_gl, dd->cg_nalloc);
-        srenew(dd->cgindex, dd->cg_nalloc+1);
-    }
-    if (bMaster)
-    {
-        for (i = 0; i < dd->nnodes; i++)
-        {
-            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
-            ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
-        }
-    }
-
-    dd_scatterv(dd,
-                bMaster ? ma->ibuf : nullptr,
-                bMaster ? ma->ibuf+dd->nnodes : nullptr,
-                bMaster ? ma->cg : nullptr,
-                dd->ncg_home*sizeof(int), dd->index_gl);
-
-    /* Determine the home charge group sizes */
-    dd->cgindex[0] = 0;
-    for (i = 0; i < dd->ncg_home; i++)
-    {
-        cg_gl            = dd->index_gl[i];
-        dd->cgindex[i+1] =
-            dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
-    }
-
-    if (debug)
-    {
-        fprintf(debug, "Home charge groups:\n");
-        for (i = 0; i < dd->ncg_home; i++)
-        {
-            fprintf(debug, " %d", dd->index_gl[i]);
-            if (i % 10 == 9)
-            {
-                fprintf(debug, "\n");
-            }
-        }
-        fprintf(debug, "\n");
-    }
-}
-
-static int compact_and_copy_vec_at(int ncg, int *move,
-                                   int *cgindex,
-                                   int nvec, int vec,
-                                   rvec *src, gmx_domdec_comm_t *comm,
-                                   gmx_bool bCompact)
-{
-    int m, icg, i, i0, i1, nrcg;
-    int home_pos;
-    int pos_vec[DIM*2];
-
-    home_pos = 0;
-
-    for (m = 0; m < DIM*2; m++)
-    {
-        pos_vec[m] = 0;
-    }
-
-    i0 = 0;
-    for (icg = 0; icg < ncg; icg++)
-    {
-        i1 = cgindex[icg+1];
-        m  = move[icg];
-        if (m == -1)
-        {
-            if (bCompact)
-            {
-                /* Compact the home array in place */
-                for (i = i0; i < i1; i++)
-                {
-                    copy_rvec(src[i], src[home_pos++]);
-                }
-            }
-        }
-        else
-        {
-            /* Copy to the communication buffer */
-            nrcg        = i1 - i0;
-            pos_vec[m] += 1 + vec*nrcg;
-            for (i = i0; i < i1; i++)
-            {
-                copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
-            }
-            pos_vec[m] += (nvec - vec - 1)*nrcg;
-        }
-        if (!bCompact)
-        {
-            home_pos += i1 - i0;
-        }
-        i0 = i1;
-    }
-
-    return home_pos;
-}
-
-static int compact_and_copy_vec_cg(int ncg, int *move,
-                                   int *cgindex,
-                                   int nvec, rvec *src, gmx_domdec_comm_t *comm,
-                                   gmx_bool bCompact)
-{
-    int m, icg, i0, i1, nrcg;
-    int home_pos;
-    int pos_vec[DIM*2];
-
-    home_pos = 0;
-
-    for (m = 0; m < DIM*2; m++)
-    {
-        pos_vec[m] = 0;
-    }
-
-    i0 = 0;
-    for (icg = 0; icg < ncg; icg++)
-    {
-        i1 = cgindex[icg+1];
-        m  = move[icg];
-        if (m == -1)
-        {
-            if (bCompact)
-            {
-                /* Compact the home array in place */
-                copy_rvec(src[icg], src[home_pos++]);
-            }
-        }
-        else
-        {
-            nrcg = i1 - i0;
-            /* Copy to the communication buffer */
-            copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
-            pos_vec[m] += 1 + nrcg*nvec;
-        }
-        i0 = i1;
-    }
-    if (!bCompact)
-    {
-        home_pos = ncg;
-    }
-
-    return home_pos;
-}
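-
-/* Buffer layout assumed by the two compact_and_copy_vec_* routines above,
- * illustrated for nvec = 2 state vectors (e.g. x and v) and a charge group
- * of 3 atoms moving in direction m (hypothetical case): the entry in
- * comm->cgcm_state[m] is 1 + nvec*nrcg = 7 rvecs,
- *   [cg_cm, x0, x1, x2, v0, v1, v2],
- * so compact_and_copy_vec_at skips 1 + vec*nrcg rvecs before writing
- * vector vec, and compact_and_copy_vec_cg writes cg_cm at offset 0 and
- * advances by the full 1 + nrcg*nvec.
- */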
-
-static int compact_ind(int ncg, int *move,
-                       int *index_gl, int *cgindex,
-                       int *gatindex,
-                       gmx_ga2la_t *ga2la, char *bLocalCG,
-                       int *cginfo)
-{
-    int cg, nat, a0, a1, a, a_gl;
-    int home_pos;
-
-    home_pos = 0;
-    nat      = 0;
-    for (cg = 0; cg < ncg; cg++)
-    {
-        a0 = cgindex[cg];
-        a1 = cgindex[cg+1];
-        if (move[cg] == -1)
-        {
-            /* Compact the home arrays in place.
-             * Anything that can be done here avoids access to global arrays.
-             */
-            cgindex[home_pos] = nat;
-            for (a = a0; a < a1; a++)
-            {
-                a_gl          = gatindex[a];
-                gatindex[nat] = a_gl;
-                /* The cell number stays 0, so we don't need to set it */
-                ga2la_change_la(ga2la, a_gl, nat);
-                nat++;
-            }
-            index_gl[home_pos] = index_gl[cg];
-            cginfo[home_pos]   = cginfo[cg];
-            /* The charge group remains local, so bLocalCG does not change */
-            home_pos++;
-        }
-        else
-        {
-            /* Clear the global indices */
-            for (a = a0; a < a1; a++)
-            {
-                ga2la_del(ga2la, gatindex[a]);
-            }
-            if (bLocalCG)
-            {
-                bLocalCG[index_gl[cg]] = FALSE;
-            }
-        }
-    }
-    cgindex[home_pos] = nat;
-
-    return home_pos;
-}
-
-static void clear_and_mark_ind(int ncg, int *move,
-                               int *index_gl, int *cgindex, int *gatindex,
-                               gmx_ga2la_t *ga2la, char *bLocalCG,
-                               int *cell_index)
-{
-    int cg, a0, a1, a;
-
-    for (cg = 0; cg < ncg; cg++)
-    {
-        if (move[cg] >= 0)
-        {
-            a0 = cgindex[cg];
-            a1 = cgindex[cg+1];
-            /* Clear the global indices */
-            for (a = a0; a < a1; a++)
-            {
-                ga2la_del(ga2la, gatindex[a]);
-            }
-            if (bLocalCG)
-            {
-                bLocalCG[index_gl[cg]] = FALSE;
-            }
-            /* Signal that this cg has moved using the ns cell index.
-             * Here we set it to -1. fill_grid will change it
-             * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
-             */
-            cell_index[cg] = -1;
-        }
-    }
-}
-
-static void print_cg_move(FILE *fplog,
-                          gmx_domdec_t *dd,
-                          gmx_int64_t step, int cg, int dim, int dir,
-                          gmx_bool bHaveCgcmOld, real limitd,
-                          rvec cm_old, rvec cm_new, real pos_d)
-{
-    gmx_domdec_comm_t *comm;
-    char               buf[22];
-
-    comm = dd->comm;
-
-    fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
-    if (limitd > 0)
-    {
-        fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
-                dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
-                ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
-    }
-    else
-    {
-        /* We don't have a limiting distance available: don't print it */
-        fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
-                dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
-                ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
-    }
-    fprintf(fplog, "distance out of cell %f\n",
-            dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
-    if (bHaveCgcmOld)
-    {
-        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
-                cm_old[XX], cm_old[YY], cm_old[ZZ]);
-    }
-    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
-            cm_new[XX], cm_new[YY], cm_new[ZZ]);
-    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
-            dim2char(dim),
-            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
-    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
-            dim2char(dim),
-            comm->cell_x0[dim], comm->cell_x1[dim]);
-}
-
-static void cg_move_error(FILE *fplog,
-                          gmx_domdec_t *dd,
-                          gmx_int64_t step, int cg, int dim, int dir,
-                          gmx_bool bHaveCgcmOld, real limitd,
-                          rvec cm_old, rvec cm_new, real pos_d)
-{
-    if (fplog)
-    {
-        print_cg_move(fplog, dd, step, cg, dim, dir,
-                      bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
-    }
-    print_cg_move(stderr, dd, step, cg, dim, dir,
-                  bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
-    gmx_fatal(FARGS,
-              "%s moved too far between two domain decomposition steps\n"
-              "This usually means that your system is not well equilibrated",
-              dd->comm->bCGs ? "A charge group" : "An atom");
-}
-
-static void rotate_state_atom(t_state *state, int a)
-{
-    if (state->flags & (1 << estX))
-    {
-        /* Rotate the complete state; for a rectangular box only */
-        state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
-        state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
-    }
-    if (state->flags & (1 << estV))
-    {
-        state->v[a][YY] = -state->v[a][YY];
-        state->v[a][ZZ] = -state->v[a][ZZ];
-    }
-    if (state->flags & (1 << estCGP))
-    {
-        state->cg_p[a][YY] = -state->cg_p[a][YY];
-        state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
-    }
-}
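-
-/* Illustrative screw-PBC example (hypothetical numbers): with box lengths
- * Ly = Lz = 5, an atom at (y, z) = (1, 2) that is wrapped across the x
- * boundary is rotated to (4, 3), while the y and z components of its
- * velocity (and of the cg_p vector) simply change sign; the shift by the
- * x box vector itself is applied by the caller.
- */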
-
-static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
-{
-    if (natoms > comm->moved_nalloc)
-    {
-        /* Contents should be preserved here */
-        comm->moved_nalloc = over_alloc_dd(natoms);
-        srenew(comm->moved, comm->moved_nalloc);
-    }
-
-    return comm->moved;
-}
-
-static void calc_cg_move(FILE *fplog, gmx_int64_t step,
-                         gmx_domdec_t *dd,
-                         t_state *state,
-                         ivec tric_dir, matrix tcm,
-                         rvec cell_x0, rvec cell_x1,
-                         rvec limitd, rvec limit0, rvec limit1,
-                         const int *cgindex,
-                         int cg_start, int cg_end,
-                         rvec *cg_cm,
-                         int *move)
-{
-    int      npbcdim;
-    int      cg, k, k0, k1, d, dim, d2;
-    int      mc, nrcg;
-    int      flag;
-    gmx_bool bScrew;
-    ivec     dev;
-    real     inv_ncg, pos_d;
-    rvec     cm_new;
-
-    npbcdim = dd->npbcdim;
-
-    for (cg = cg_start; cg < cg_end; cg++)
-    {
-        k0   = cgindex[cg];
-        k1   = cgindex[cg+1];
-        nrcg = k1 - k0;
-        if (nrcg == 1)
-        {
-            copy_rvec(state->x[k0], cm_new);
-        }
-        else
-        {
-            inv_ncg = 1.0/nrcg;
-
-            clear_rvec(cm_new);
-            for (k = k0; (k < k1); k++)
-            {
-                rvec_inc(cm_new, state->x[k]);
-            }
-            for (d = 0; (d < DIM); d++)
-            {
-                cm_new[d] = inv_ncg*cm_new[d];
-            }
-        }
-
-        clear_ivec(dev);
-        /* Do pbc and check DD cell boundary crossings */
-        for (d = DIM-1; d >= 0; d--)
-        {
-            if (dd->nc[d] > 1)
-            {
-                bScrew = (dd->bScrewPBC && d == XX);
-                /* Determine the location of this cg in lattice coordinates */
-                pos_d = cm_new[d];
-                if (tric_dir[d])
-                {
-                    for (d2 = d+1; d2 < DIM; d2++)
-                    {
-                        pos_d += cm_new[d2]*tcm[d2][d];
-                    }
-                }
-                /* Put the charge group in the triclinic unit-cell */
-                if (pos_d >= cell_x1[d])
-                {
-                    if (pos_d >= limit1[d])
-                    {
-                        cg_move_error(fplog, dd, step, cg, d, 1,
-                                      cg_cm != as_rvec_array(state->x.data()), limitd[d],
-                                      cg_cm[cg], cm_new, pos_d);
-                    }
-                    dev[d] = 1;
-                    if (dd->ci[d] == dd->nc[d] - 1)
-                    {
-                        rvec_dec(cm_new, state->box[d]);
-                        if (bScrew)
-                        {
-                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
-                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
-                        }
-                        for (k = k0; (k < k1); k++)
-                        {
-                            rvec_dec(state->x[k], state->box[d]);
-                            if (bScrew)
-                            {
-                                rotate_state_atom(state, k);
-                            }
-                        }
-                    }
-                }
-                else if (pos_d < cell_x0[d])
-                {
-                    if (pos_d < limit0[d])
-                    {
-                        cg_move_error(fplog, dd, step, cg, d, -1,
-                                      cg_cm != as_rvec_array(state->x.data()), limitd[d],
-                                      cg_cm[cg], cm_new, pos_d);
-                    }
-                    dev[d] = -1;
-                    if (dd->ci[d] == 0)
-                    {
-                        rvec_inc(cm_new, state->box[d]);
-                        if (bScrew)
-                        {
-                            cm_new[YY] = state->box[YY][YY] - cm_new[YY];
-                            cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
-                        }
-                        for (k = k0; (k < k1); k++)
-                        {
-                            rvec_inc(state->x[k], state->box[d]);
-                            if (bScrew)
-                            {
-                                rotate_state_atom(state, k);
-                            }
-                        }
-                    }
-                }
-            }
-            else if (d < npbcdim)
-            {
-                /* Put the charge group in the rectangular unit-cell */
-                while (cm_new[d] >= state->box[d][d])
-                {
-                    rvec_dec(cm_new, state->box[d]);
-                    for (k = k0; (k < k1); k++)
-                    {
-                        rvec_dec(state->x[k], state->box[d]);
-                    }
-                }
-                while (cm_new[d] < 0)
-                {
-                    rvec_inc(cm_new, state->box[d]);
-                    for (k = k0; (k < k1); k++)
-                    {
-                        rvec_inc(state->x[k], state->box[d]);
-                    }
-                }
-            }
-        }
-
-        copy_rvec(cm_new, cg_cm[cg]);
-
-        /* Determine where this cg should go */
-        flag = 0;
-        mc   = -1;
-        for (d = 0; d < dd->ndim; d++)
-        {
-            dim = dd->dim[d];
-            if (dev[dim] == 1)
-            {
-                flag |= DD_FLAG_FW(d);
-                if (mc == -1)
-                {
-                    mc = d*2;
-                }
-            }
-            else if (dev[dim] == -1)
-            {
-                flag |= DD_FLAG_BW(d);
-                if (mc == -1)
-                {
-                    if (dd->nc[dim] > 2)
-                    {
-                        mc = d*2 + 1;
-                    }
-                    else
-                    {
-                        mc = d*2;
-                    }
-                }
-            }
-        }
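        /* Illustrative note, not from the GROMACS source: mc encodes the send
         * direction as 2*d for the forward and 2*d + 1 for the backward
         * neighbor along decomposition dimension d; when a dimension has only
         * two cells the forward and backward neighbors coincide, so both
         * directions map to 2*d.
         */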
-        /* Temporarily store the flag in move */
-        move[cg] = mc + flag;
-    }
-}
-
-static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
-                               gmx_domdec_t *dd, ivec tric_dir,
-                               t_state *state, PaddedRVecVector *f,
-                               t_forcerec *fr,
-                               gmx_bool bCompact,
-                               t_nrnb *nrnb,
-                               int *ncg_stay_home,
-                               int *ncg_moved)
-{
-    int               *move;
-    int                npbcdim;
-    int                ncg[DIM*2] = { 0 }, nat[DIM*2] = { 0 };
-    int                i, cg, k, d, dim, dim2, dir, d2, d3;
-    int                mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
-    int                sbuf[2], rbuf[2];
-    int                home_pos_cg, home_pos_at, buf_pos;
-    int                flag;
-    real               pos_d;
-    matrix             tcm;
-    rvec              *cg_cm = nullptr, cell_x0, cell_x1, limitd, limit0, limit1;
-    int               *cgindex;
-    cginfo_mb_t       *cginfo_mb;
-    gmx_domdec_comm_t *comm;
-    int               *moved;
-    int                nthread, thread;
-
-    if (dd->bScrewPBC)
-    {
-        check_screw_box(state->box);
-    }
-
-    comm  = dd->comm;
-    if (fr->cutoff_scheme == ecutsGROUP)
-    {
-        cg_cm = fr->cg_cm;
-    }
-
-    // Positions are always present, so there's nothing to flag
-    bool bV   = state->flags & (1<<estV);
-    bool bCGP = state->flags & (1<<estCGP);
-
-    if (dd->ncg_tot > comm->nalloc_int)
-    {
-        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
-        srenew(comm->buf_int, comm->nalloc_int);
-    }
-    move = comm->buf_int;
-
-    npbcdim = dd->npbcdim;
-
-    for (d = 0; (d < DIM); d++)
-    {
-        limitd[d] = dd->comm->cellsize_min[d];
-        if (d >= npbcdim && dd->ci[d] == 0)
-        {
-            cell_x0[d] = -GMX_FLOAT_MAX;
-        }
-        else
-        {
-            cell_x0[d] = comm->cell_x0[d];
-        }
-        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
-        {
-            cell_x1[d] = GMX_FLOAT_MAX;
-        }
-        else
-        {
-            cell_x1[d] = comm->cell_x1[d];
-        }
-        if (d < npbcdim)
-        {
-            limit0[d] = comm->old_cell_x0[d] - limitd[d];
-            limit1[d] = comm->old_cell_x1[d] + limitd[d];
-        }
-        else
-        {
-            /* We check after communication if a charge group moved
-             * more than one cell. Set the pre-comm check limit to float_max.
-             */
-            limit0[d] = -GMX_FLOAT_MAX;
-            limit1[d] =  GMX_FLOAT_MAX;
-        }
-    }
-
-    make_tric_corr_matrix(npbcdim, state->box, tcm);
-
-    cgindex = dd->cgindex;
-
-    nthread = gmx_omp_nthreads_get(emntDomdec);
-
-    /* Compute the center of geometry for all home charge groups
-     * and put them in the box and determine where they should go.
-     */
-#pragma omp parallel for num_threads(nthread) schedule(static)
-    for (thread = 0; thread < nthread; thread++)
-    {
-        try
-        {
-            calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
-                         cell_x0, cell_x1, limitd, limit0, limit1,
-                         cgindex,
-                         ( thread   *dd->ncg_home)/nthread,
-                         ((thread+1)*dd->ncg_home)/nthread,
-                         fr->cutoff_scheme == ecutsGROUP ? cg_cm : as_rvec_array(state->x.data()),
-                         move);
-        }
-        GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
-    }
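/* Illustrative sketch, not from the GROMACS source: the static split used in
 * the parallel loop above hands each OpenMP thread a contiguous, nearly equal
 * block of home charge groups, covering every group exactly once.
 */
static void exampleThreadRanges(int numChargeGroups, int numThreads)
{
    for (int thread = 0; thread < numThreads; thread++)
    {
        int begin = ( thread     *numChargeGroups)/numThreads;
        int end   = ((thread + 1)*numChargeGroups)/numThreads;
        /* e.g. numChargeGroups = 10, numThreads = 4 gives the ranges
         * [0,2), [2,5), [5,7) and [7,10) */
        fprintf(stderr, "thread %d handles groups [%d, %d)\n", thread, begin, end);
    }
}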
-
-    for (cg = 0; cg < dd->ncg_home; cg++)
-    {
-        if (move[cg] >= 0)
-        {
-            mc       = move[cg];
-            flag     = mc & ~DD_FLAG_NRCG;
-            mc       = mc & DD_FLAG_NRCG;
-            move[cg] = mc;
-
-            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
-            {
-                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
-                srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
-            }
-            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
-            /* We store the cg size in the lower 16 bits
-             * and the place where the charge group should go
-             * in the next 6 bits. This saves some communication volume.
-             */
-            nrcg = cgindex[cg+1] - cgindex[cg];
-            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
-            ncg[mc] += 1;
-            nat[mc] += nrcg;
-        }
-    }
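/* Illustrative sketch, not from the GROMACS source, assuming DD_FLAG_NRCG
 * masks the low 16 bits and the DD_FLAG_FW/DD_FLAG_BW bits sit above them:
 * the group size and the move-direction flags share a single int, as packed
 * and unpacked above, which saves communication volume.
 */
static int examplePackSizeAndFlags(int nrcg, int directionFlags)
{
    return nrcg | directionFlags;              /* size in the low bits, flags above */
}

static void exampleUnpackSizeAndFlags(int packed, int *nrcg, int *directionFlags)
{
    *nrcg           = packed & DD_FLAG_NRCG;   /* number of atoms in the group */
    *directionFlags = packed & ~DD_FLAG_NRCG;  /* where the group should go    */
}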
-
-    inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
-    inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
-
-    *ncg_moved = 0;
-    for (i = 0; i < dd->ndim*2; i++)
-    {
-        *ncg_moved += ncg[i];
-    }
-
-    nvec = 1;
-    if (bV)
-    {
-        nvec++;
-    }
-    if (bCGP)
-    {
-        nvec++;
-    }
-
-    /* Make sure the communication buffers are large enough */
-    for (mc = 0; mc < dd->ndim*2; mc++)
-    {
-        nvr = ncg[mc] + nat[mc]*nvec;
-        if (nvr > comm->cgcm_state_nalloc[mc])
-        {
-            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
-            srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
-        }
-    }
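/* Illustrative note, not from the GROMACS source: the rvec send buffer for
 * direction mc holds one leading entry per moved charge group (cg_cm, or a
 * duplicated coordinate with the Verlet scheme) plus nvec entries per moved
 * atom (x, and optionally v and cg_p), which is exactly the
 * nvr = ncg[mc] + nat[mc]*nvec elements reserved above.
 */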
-
-    switch (fr->cutoff_scheme)
-    {
-        case ecutsGROUP:
-            /* Recalculating cg_cm might be cheaper than communicating,
-             * but that could give rise to rounding issues.
-             */
-            home_pos_cg =
-                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
-                                        nvec, cg_cm, comm, bCompact);
-            break;
-        case ecutsVERLET:
-            /* Without charge groups we send the moved atom coordinates
-             * over twice. This is so the code below can be used without
-             * many conditionals both with and without charge groups.
-             */
-            home_pos_cg =
-                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
-                                        nvec, as_rvec_array(state->x.data()), comm, FALSE);
-            if (bCompact)
-            {
-                home_pos_cg -= *ncg_moved;
-            }
-            break;
-        default:
-            gmx_incons("unimplemented");
-            home_pos_cg = 0;
-    }
-
-    vec         = 0;
-    home_pos_at =
-        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
-                                nvec, vec++, as_rvec_array(state->x.data()),
-                                comm, bCompact);
-    if (bV)
-    {
-        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
-                                nvec, vec++, as_rvec_array(state->v.data()),
-                                comm, bCompact);
-    }
-    if (bCGP)
-    {
-        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
-                                nvec, vec++, as_rvec_array(state->cg_p.data()),
-                                comm, bCompact);
-    }
-
-    if (bCompact)
-    {
-        compact_ind(dd->ncg_home, move,
-                    dd->index_gl, dd->cgindex, dd->gatindex,
-                    dd->ga2la, comm->bLocalCG,
-                    fr->cginfo);
-    }
-    else
-    {
-        if (fr->cutoff_scheme == ecutsVERLET)
-        {
-            moved = get_moved(comm, dd->ncg_home);
-
-            for (k = 0; k < dd->ncg_home; k++)
-            {
-                moved[k] = 0;
-            }
-        }
-        else
-        {
-            moved = fr->ns->grid->cell_index;
-        }
-
-        clear_and_mark_ind(dd->ncg_home, move,
-                           dd->index_gl, dd->cgindex, dd->gatindex,
-                           dd->ga2la, comm->bLocalCG,
-                           moved);
-    }
-
-    cginfo_mb = fr->cginfo_mb;
-
-    *ncg_stay_home = home_pos_cg;
-    for (d = 0; d < dd->ndim; d++)
-    {
-        dim      = dd->dim[d];
-        ncg_recv = 0;
-        nvr      = 0;
-        for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
-        {
-            cdd = d*2 + dir;
-            /* Communicate the cg and atom counts */
-            sbuf[0] = ncg[cdd];
-            sbuf[1] = nat[cdd];
-            if (debug)
-            {
-                fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
-                        d, dir, sbuf[0], sbuf[1]);
-            }
-            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
-
-            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
-            {
-                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
-                srenew(comm->buf_int, comm->nalloc_int);
-            }
-
-            /* Communicate the charge group indices, sizes and flags */
-            dd_sendrecv_int(dd, d, dir,
-                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
-                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
-
-            nvs = ncg[cdd] + nat[cdd]*nvec;
-            i   = rbuf[0]  + rbuf[1] *nvec;
-            vec_rvec_check_alloc(&comm->vbuf, nvr+i);
-
-            /* Communicate cgcm and state */
-            dd_sendrecv_rvec(dd, d, dir,
-                             comm->cgcm_state[cdd], nvs,
-                             comm->vbuf.v+nvr, i);
-            ncg_recv += rbuf[0];
-            nvr      += i;
-        }
-
-        dd_check_alloc_ncg(fr, state, f, home_pos_cg + ncg_recv);
-        if (fr->cutoff_scheme == ecutsGROUP)
-        {
-            /* Here we resize to more than necessary and shrink later */
-            dd_resize_state(state, f, home_pos_at + ncg_recv*MAX_CGCGSIZE);
-        }
-
-        /* Process the received charge groups */
-        buf_pos = 0;
-        for (cg = 0; cg < ncg_recv; cg++)
-        {
-            flag = comm->buf_int[cg*DD_CGIBS+1];
-
-            if (dim >= npbcdim && dd->nc[dim] > 2)
-            {
-                /* No pbc in this dim and more than one domain boundary.
-                 * We do a separate check whether a charge group moved too far.
-                 */
-                if (((flag & DD_FLAG_FW(d)) &&
-                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
-                    ((flag & DD_FLAG_BW(d)) &&
-                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
-                {
-                    cg_move_error(fplog, dd, step, cg, dim,
-                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
-                                  fr->cutoff_scheme == ecutsGROUP, 0,
-                                  comm->vbuf.v[buf_pos],
-                                  comm->vbuf.v[buf_pos],
-                                  comm->vbuf.v[buf_pos][dim]);
-                }
-            }
-
-            mc = -1;
-            if (d < dd->ndim-1)
-            {
-                /* Check which direction this cg should go */
-                for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
-                {
-                    if (isDlbOn(dd->comm))
-                    {
-                        /* The cell boundaries for dimension d2 are not equal
-                         * for each cell row of the lower dimension(s),
-                         * therefore we might need to redetermine where
-                         * this cg should go.
-                         */
-                        dim2 = dd->dim[d2];
-                        /* If this cg crosses the box boundary in dimension d2
-                         * we can use the communicated flag, so we do not
-                         * have to worry about pbc.
-                         */
-                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
-                               (flag & DD_FLAG_FW(d2))) ||
-                              (dd->ci[dim2] == 0 &&
-                               (flag & DD_FLAG_BW(d2)))))
-                        {
-                            /* Clear the two flags for this dimension */
-                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
-                            /* Determine the location of this cg
-                             * in lattice coordinates
-                             */
-                            pos_d = comm->vbuf.v[buf_pos][dim2];
-                            if (tric_dir[dim2])
-                            {
-                                for (d3 = dim2+1; d3 < DIM; d3++)
-                                {
-                                    pos_d +=
-                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
-                                }
-                            }
-                             * Check that we are not at the box edge.
-                             * pbc is only handled in the first step above,
-                             * but this check could move the group over pbc
-                             * while the first step did not, due to different rounding.
-                             */
-                            if (pos_d >= cell_x1[dim2] &&
-                                dd->ci[dim2] != dd->nc[dim2]-1)
-                            {
-                                flag |= DD_FLAG_FW(d2);
-                            }
-                            else if (pos_d < cell_x0[dim2] &&
-                                     dd->ci[dim2] != 0)
-                            {
-                                flag |= DD_FLAG_BW(d2);
-                            }
-                            comm->buf_int[cg*DD_CGIBS+1] = flag;
-                        }
-                    }
-                    /* Set to which neighboring cell this cg should go */
-                    if (flag & DD_FLAG_FW(d2))
-                    {
-                        mc = d2*2;
-                    }
-                    else if (flag & DD_FLAG_BW(d2))
-                    {
-                        if (dd->nc[dd->dim[d2]] > 2)
-                        {
-                            mc = d2*2+1;
-                        }
-                        else
-                        {
-                            mc = d2*2;
-                        }
-                    }
-                }
-            }
-
-            nrcg = flag & DD_FLAG_NRCG;
-            if (mc == -1)
-            {
-                if (home_pos_cg+1 > dd->cg_nalloc)
-                {
-                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
-                    srenew(dd->index_gl, dd->cg_nalloc);
-                    srenew(dd->cgindex, dd->cg_nalloc+1);
-                }
-                /* Set the global charge group index and size */
-                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
-                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
-                /* Copy the state from the buffer */
-                if (fr->cutoff_scheme == ecutsGROUP)
-                {
-                    cg_cm = fr->cg_cm;
-                    copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
-                }
-                buf_pos++;
-
-                /* Set the cginfo */
-                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
-                                                   dd->index_gl[home_pos_cg]);
-                if (comm->bLocalCG)
-                {
-                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
-                }
-
-                for (i = 0; i < nrcg; i++)
-                {
-                    copy_rvec(comm->vbuf.v[buf_pos++],
-                              state->x[home_pos_at+i]);
-                }
-                if (bV)
-                {
-                    for (i = 0; i < nrcg; i++)
-                    {
-                        copy_rvec(comm->vbuf.v[buf_pos++],
-                                  state->v[home_pos_at+i]);
-                    }
-                }
-                if (bCGP)
-                {
-                    for (i = 0; i < nrcg; i++)
-                    {
-                        copy_rvec(comm->vbuf.v[buf_pos++],
-                                  state->cg_p[home_pos_at+i]);
-                    }
-                }
-                home_pos_cg += 1;
-                home_pos_at += nrcg;
-            }
-            else
-            {
-                /* Reallocate the buffers if necessary */
-                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
-                {
-                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
-                    srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
-                }
-                nvr = ncg[mc] + nat[mc]*nvec;
-                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
-                {
-                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
-                    srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
-                }
-                /* Copy from the receive to the send buffers */
-                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
-                       comm->buf_int + cg*DD_CGIBS,
-                       DD_CGIBS*sizeof(int));
-                memcpy(comm->cgcm_state[mc][nvr],
-                       comm->vbuf.v[buf_pos],
-                       (1+nrcg*nvec)*sizeof(rvec));
-                buf_pos += 1 + nrcg*nvec;
-                ncg[mc] += 1;
-                nat[mc] += nrcg;
-            }
-        }
-    }
+    gmx_domdec_comm_t *comm;
+    int                dim_ind, dim;
 
-    /* With sorting (!bCompact) the indices are now only partially up to date
-     * and ncg_home and nat_home are not the real count, since there are
-     * "holes" in the arrays for the charge groups that moved to neighbors.
-     */
-    if (fr->cutoff_scheme == ecutsVERLET)
+    comm = dd->comm;
+
+    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
     {
-        moved = get_moved(comm, home_pos_cg);
+        dim = dd->dim[dim_ind];
 
-        for (i = dd->ncg_home; i < home_pos_cg; i++)
+        /* Without PBC we don't have restrictions on the outer cells */
+        if (!(dim >= ddbox->npbcdim &&
+              (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
+            isDlbOn(comm) &&
+            (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
+            comm->cellsize_min[dim])
         {
-            moved[i] = 0;
+            char buf[22];
+            gmx_fatal(FARGS, "step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
+                      gmx_step_str(step, buf), dim2char(dim),
+                      comm->cell_x1[dim] - comm->cell_x0[dim],
+                      ddbox->skew_fac[dim],
+                      dd->comm->cellsize_min[dim],
+                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
         }
     }
-    dd->ncg_home = home_pos_cg;
-    dd->nat_home = home_pos_at;
-
-    if (fr->cutoff_scheme == ecutsGROUP && !bCompact)
-    {
-        /* We overallocated before, we need to set the right size here */
-        dd_resize_state(state, f, dd->nat_home);
-    }
 
-    if (debug)
+    if ((isDlbOn(dd->comm) && dd->ndim > 1) || ddbox->nboundeddim < DIM)
     {
-        fprintf(debug,
-                "Finished repartitioning: cgs moved out %d, new home %d\n",
-                *ncg_moved, dd->ncg_home-*ncg_moved);
-
+        /* Communicate the boundaries and update cell_ns_x0/1 */
+        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
+        if (isDlbOn(dd->comm) && dd->ndim > 1)
+        {
+            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
+        }
     }
 }
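/* Illustrative sketch, not from the GROMACS source: the fatal-error check in
 * the loop above compares the effective cell width perpendicular to the cell
 * boundary planes with the smallest allowed cell size; for triclinic boxes
 * the skew factor (<= 1) shrinks the nominal width cell_x1 - cell_x0.
 */
static bool exampleCellIsWideEnough(real cellX0, real cellX1,
                                    real skewFac, real cellsizeMin)
{
    real effectiveWidth = (cellX1 - cellX0)*skewFac;

    return effectiveWidth >= cellsizeMin;
}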
 
@@ -4882,8 +2098,6 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 {
     gmx_domdec_comm_t *comm;
     domdec_load_t     *load;
-    domdec_root_t     *root = nullptr;
-    int                d, dim, i, pos;
     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
     gmx_bool           bSepPME;
 
@@ -4905,9 +2119,10 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
         comm->load[0].pme = comm->cycl[ddCyclPME];
     }
 
-    for (d = dd->ndim-1; d >= 0; d--)
+    for (int d = dd->ndim - 1; d >= 0; d--)
     {
-        dim = dd->dim[d];
+        const DDCellsizesWithDlb &cellsizes = comm->cellsizesWithDlb[d];
+        const int                 dim       = dd->dim[d];
         /* Check if we participate in the communication in this dimension */
         if (d == dd->ndim-1 ||
             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
@@ -4915,9 +2130,9 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
             load = &comm->load[d];
             if (isDlbOn(dd->comm))
             {
-                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
+                cell_frac = cellsizes.fracUpper - cellsizes.fracLower;
             }
-            pos = 0;
+            int pos = 0;
             if (d == dd->ndim-1)
             {
                 sbuf[pos++] = dd_force_load(comm);
@@ -4928,8 +2143,8 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                     sbuf[pos++] = cell_frac;
                     if (d > 0)
                     {
-                        sbuf[pos++] = comm->cell_f_max0[d];
-                        sbuf[pos++] = comm->cell_f_min1[d];
+                        sbuf[pos++] = cellsizes.fracLowerMax;
+                        sbuf[pos++] = cellsizes.fracUpperMin;
                     }
                 }
                 if (bSepPME)
@@ -4949,8 +2164,8 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                     sbuf[pos++] = comm->load[d+1].flags;
                     if (d > 0)
                     {
-                        sbuf[pos++] = comm->cell_f_max0[d];
-                        sbuf[pos++] = comm->cell_f_min1[d];
+                        sbuf[pos++] = cellsizes.fracLowerMax;
+                        sbuf[pos++] = cellsizes.fracUpperMin;
                     }
                 }
                 if (bSepPME)
@@ -4970,10 +2185,12 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 #endif
             if (dd->ci[dim] == dd->master_ci[dim])
             {
-                /* We are the root, process this row */
+                /* We are the master along this row, process this row */
+                RowMaster *rowMaster = nullptr;
+
                 if (isDlbOn(comm))
                 {
-                    root = comm->root[d];
+                    rowMaster = cellsizes.rowMaster.get();
                 }
                 load->sum      = 0;
                 load->max      = 0;
@@ -4982,15 +2199,15 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                 load->flags    = 0;
                 load->mdf      = 0;
                 load->pme      = 0;
-                pos            = 0;
-                for (i = 0; i < dd->nc[dim]; i++)
+                int pos        = 0;
+                for (int i = 0; i < dd->nc[dim]; i++)
                 {
                     load->sum += load->load[pos++];
                     load->max  = std::max(load->max, load->load[pos]);
                     pos++;
                     if (isDlbOn(dd->comm))
                     {
-                        if (root->bLimited)
+                        if (rowMaster->dlbIsLimited)
                         {
                             /* This direction could not be load balanced properly,
                             * therefore we need to use the maximum instead of the average load.
@@ -5006,12 +2223,12 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                         pos++;
                         if (d < dd->ndim-1)
                         {
-                            load->flags = (int)(load->load[pos++] + 0.5);
+                            load->flags = static_cast<int>(load->load[pos++] + 0.5);
                         }
                         if (d > 0)
                         {
-                            root->cell_f_max0[i] = load->load[pos++];
-                            root->cell_f_min1[i] = load->load[pos++];
+                            rowMaster->bounds[i].cellFracLowerMax = load->load[pos++];
+                            rowMaster->bounds[i].cellFracUpperMin = load->load[pos++];
                         }
                     }
                     if (bSepPME)
@@ -5022,7 +2239,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
                         pos++;
                     }
                 }
-                if (isDlbOn(comm) && root->bLimited)
+                if (isDlbOn(comm) && rowMaster->dlbIsLimited)
                 {
                     load->sum_m *= dd->nc[dim];
                     load->flags |= (1<<d);
@@ -5039,7 +2256,7 @@ static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
         comm->load_max   += comm->load[0].max;
         if (isDlbOn(comm))
         {
-            for (d = 0; d < dd->ndim; d++)
+            for (int d = 0; d < dd->ndim; d++)
             {
                 if (comm->load[0].flags & (1<<d))
                 {
@@ -5117,7 +2334,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
         lossFraction    = dd_force_imb_perf_loss(dd);
 
         std::string msg         = "\n Dynamic load balancing report:\n";
-        std::string dlbStateStr = "";
+        std::string dlbStateStr;
 
         switch (dd->comm->dlbState)
         {
@@ -5193,7 +2410,7 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
         sprintf(buf, " Average PME mesh/force load: %5.3f\n", pmeForceRatio);
         fprintf(fplog, "%s", buf);
         fprintf(stderr, "%s", buf);
-        sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossFractionPme)*100);
+        sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", std::fabs(lossFractionPme)*100);
         fprintf(fplog, "%s", buf);
         fprintf(stderr, "%s", buf);
     }
@@ -5216,14 +2433,14 @@ static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
         fprintf(fplog, "%s\n", buf);
         fprintf(stderr, "%s\n", buf);
     }
-    if (numPmeRanks > 0 && fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
+    if (numPmeRanks > 0 && std::fabs(lossFractionPme) >= DD_PERF_LOSS_WARN)
     {
         sprintf(buf,
                 "NOTE: %.1f %% performance was lost because the PME ranks\n"
                 "      had %s work to do than the PP ranks.\n"
                 "      You might want to %s the number of PME ranks\n"
                 "      or %s the cut-off and the grid spacing.\n",
-                fabs(lossFractionPme*100),
+                std::fabs(lossFractionPme*100),
                 (lossFractionPme < 0) ? "less"     : "more",
                 (lossFractionPme < 0) ? "decrease" : "increase",
                 (lossFractionPme < 0) ? "decrease" : "increase");
@@ -5237,7 +2454,7 @@ static float dd_vol_min(gmx_domdec_t *dd)
     return dd->comm->load[0].cvol_min*dd->nnodes;
 }
 
-static gmx_bool dd_load_flags(gmx_domdec_t *dd)
+static int dd_load_flags(gmx_domdec_t *dd)
 {
     return dd->comm->load[0].flags;
 }
@@ -5270,7 +2487,7 @@ float dd_pme_f_ratio(gmx_domdec_t *dd)
     }
 }
 
-static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
+static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, int64_t step)
 {
     int  flags, d;
     char buf[22];
@@ -5315,7 +2532,7 @@ static void dd_print_load_verbose(gmx_domdec_t *dd)
     }
     if (dd->nnodes > 1)
     {
-        fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
+        fprintf(stderr, "imb F %2d%% ", static_cast<int>(dd_f_imbal(dd)*100+0.5));
     }
     if (dd->comm->cycl_n[ddCyclPME])
     {
@@ -5329,7 +2546,6 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
     MPI_Comm           c_row;
     int                dim, i, rank;
     ivec               loc_c;
-    domdec_root_t     *root;
     gmx_bool           bPartOfGroup = FALSE;
 
     dim = dd->dim[dim_ind];
@@ -5351,27 +2567,27 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
         dd->comm->mpi_comm_load[dim_ind] = c_row;
         if (!isDlbDisabled(dd->comm))
         {
+            DDCellsizesWithDlb &cellsizes = dd->comm->cellsizesWithDlb[dim_ind];
+
             if (dd->ci[dim] == dd->master_ci[dim])
             {
                 /* This is the root process of this row */
-                snew(dd->comm->root[dim_ind], 1);
-                root = dd->comm->root[dim_ind];
-                snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
-                snew(root->old_cell_f, dd->nc[dim]+1);
-                snew(root->bCellMin, dd->nc[dim]);
+                cellsizes.rowMaster  = gmx::compat::make_unique<RowMaster>();
+
+                RowMaster &rowMaster = *cellsizes.rowMaster;
+                rowMaster.cellFrac.resize(ddCellFractionBufferSize(dd, dim_ind));
+                rowMaster.oldCellFrac.resize(dd->nc[dim] + 1);
+                rowMaster.isCellMin.resize(dd->nc[dim]);
                 if (dim_ind > 0)
                 {
-                    snew(root->cell_f_max0, dd->nc[dim]);
-                    snew(root->cell_f_min1, dd->nc[dim]);
-                    snew(root->bound_min, dd->nc[dim]);
-                    snew(root->bound_max, dd->nc[dim]);
+                    rowMaster.bounds.resize(dd->nc[dim]);
                 }
-                snew(root->buf_ncd, dd->nc[dim]);
+                rowMaster.buf_ncd.resize(dd->nc[dim]);
             }
             else
             {
                 /* This is not a root process, we only need to receive cell_f */
-                snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
+                cellsizes.fracRow.resize(ddCellFractionBufferSize(dd, dim_ind));
             }
         }
         if (dd->ci[dim] == dd->master_ci[dim])
@@ -5410,8 +2626,10 @@ void dd_setup_dlb_resource_sharing(t_commrec            *cr,
     }
     /* Split the PP communicator over the physical nodes */
     /* TODO: See if we should store this (before), as it's also used for
-     * for the nodecomm summution.
+     * the nodecomm summation.
      */
+    // TODO PhysicalNodeCommunicator could be extended/used to handle
+    // the need for per-node per-group communicators.
     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
                    &mpi_comm_pp_physicalnode);
     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
@@ -5588,7 +2806,7 @@ static void setup_neighbor_relations(gmx_domdec_t *dd)
 
     if (!isDlbDisabled(dd->comm))
     {
-        snew(dd->comm->root, dd->ndim);
+        dd->comm->cellsizesWithDlb.resize(dd->ndim);
     }
 
     if (dd->comm->bRecordLoad)
@@ -5600,7 +2818,7 @@ static void setup_neighbor_relations(gmx_domdec_t *dd)
 static void make_pp_communicator(FILE                 *fplog,
                                  gmx_domdec_t         *dd,
                                  t_commrec gmx_unused *cr,
-                                 int gmx_unused        reorder)
+                                 bool gmx_unused       reorder)
 {
 #if GMX_MPI
     gmx_domdec_comm_t *comm;
@@ -5623,7 +2841,7 @@ static void make_pp_communicator(FILE                 *fplog,
         {
             periods[i] = TRUE;
         }
-        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
+        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, static_cast<int>(reorder),
                         &comm_cart);
         /* We overwrite the old communicator with the new cartesian one */
         cr->mpi_comm_mygroup = comm_cart;
@@ -5747,40 +2965,9 @@ static void receive_ddindex2simnodeid(gmx_domdec_t         *dd,
 #endif
 }
 
-static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
-                                                     int ncg, int natoms)
-{
-    gmx_domdec_master_t *ma;
-    int                  i;
-
-    snew(ma, 1);
-
-    snew(ma->ncg, dd->nnodes);
-    snew(ma->index, dd->nnodes+1);
-    snew(ma->cg, ncg);
-    snew(ma->nat, dd->nnodes);
-    snew(ma->ibuf, dd->nnodes*2);
-    snew(ma->cell_x, DIM);
-    for (i = 0; i < DIM; i++)
-    {
-        snew(ma->cell_x[i], dd->nc[i]+1);
-    }
-
-    if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
-    {
-        ma->vbuf = nullptr;
-    }
-    else
-    {
-        snew(ma->vbuf, natoms);
-    }
-
-    return ma;
-}
-
 static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
                                DdRankOrder gmx_unused rankOrder,
-                               int gmx_unused reorder)
+                               bool gmx_unused reorder)
 {
     gmx_domdec_comm_t *comm;
     int                i;
@@ -5843,7 +3030,7 @@ static void split_communicator(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
         {
             periods[i] = TRUE;
         }
-        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
+        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, static_cast<int>(reorder),
                         &comm_cart);
         MPI_Comm_rank(comm_cart, &rank);
         if (MASTER(cr) && rank != 0)
@@ -5936,7 +3123,7 @@ static void make_dd_communicators(FILE *fplog, t_commrec *cr,
                                   gmx_domdec_t *dd, DdRankOrder ddRankOrder)
 {
     gmx_domdec_comm_t *comm;
-    int                CartReorder;
+    bool               CartReorder;
 
     comm = dd->comm;
 
@@ -5949,7 +3136,7 @@ static void make_dd_communicators(FILE *fplog, t_commrec *cr,
      * Real reordering is only supported on very few architectures;
      * Blue Gene is one of them.
      */
-    CartReorder = (getenv("GMX_NO_CART_REORDER") == nullptr);
+    CartReorder = getenv("GMX_NO_CART_REORDER") == nullptr;
 
     if (cr->npmenodes > 0)
     {
@@ -5989,8 +3176,8 @@ static void make_dd_communicators(FILE *fplog, t_commrec *cr,
         dd->pme_receive_vir_ener = receive_vir_ener(dd, cr);
         if (debug)
         {
-            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
-                    dd->pme_nodeid, dd->pme_receive_vir_ener);
+            fprintf(debug, "My pme_nodeid %d receive ener %s\n",
+                    dd->pme_nodeid, gmx::boolToString(dd->pme_receive_vir_ener));
         }
     }
     else
@@ -6000,9 +3187,9 @@ static void make_dd_communicators(FILE *fplog, t_commrec *cr,
 
     if (DDMASTER(dd))
     {
-        dd->ma = init_gmx_domdec_master_t(dd,
-                                          comm->cgs_gl.nr,
-                                          comm->cgs_gl.index[comm->cgs_gl.nr]);
+        dd->ma = gmx::compat::make_unique<AtomDistribution>(dd->nc,
+                                                            comm->cgs_gl.nr,
+                                                            comm->cgs_gl.index[comm->cgs_gl.nr]);
     }
 }
 
@@ -6060,7 +3247,7 @@ static int multi_body_bondeds_count(const gmx_mtop_t *mtop)
 {
     int                  n, nmol, ftype;
     gmx_mtop_ilistloop_t iloop;
-    t_ilist             *il;
+    const t_ilist       *il;
 
     n     = 0;
     iloop = gmx_mtop_ilistloop_init(mtop);
@@ -6102,7 +3289,7 @@ static int dd_getenv(FILE *fplog, const char *env_var, int def)
     return nst;
 }
 
-static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
+static void dd_warning(const t_commrec *cr, FILE *fplog, const char *warn_string)
 {
     if (MASTER(cr))
     {
@@ -6167,7 +3354,7 @@ static int forceDlbOffOrBail(int                cmdlineDlbState,
 
     if (cmdlineDlbState == edlbsOnUser)
     {
-        gmx_fatal(FARGS, (dlbNotSupportedErr + reasonStr).c_str());
+        gmx_fatal(FARGS, "%s", (dlbNotSupportedErr + reasonStr).c_str());
     }
     else if (cmdlineDlbState == edlbsOffCanTurnOn)
     {
@@ -6239,16 +3426,13 @@ static int determineInitialDlbState(FILE *fplog, t_commrec *cr,
                 break;
             case edlbsOffCanTurnOn:
                 return forceDlbOffOrBail(dlbState, reasonStr, cr, fplog);
-                break;
             case edlbsOnCanTurnOff:
                 GMX_RELEASE_ASSERT(false, "edlbsOffCanTurnOff is not a valid initial state");
                 break;
             case edlbsOnUser:
                 return forceDlbOffOrBail(dlbState, reasonStr + " In load balanced runs binary reproducibility cannot be ensured.", cr, fplog);
-                break;
             default:
                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", dlbState);
-                break;
         }
     }
 
@@ -6257,8 +3441,6 @@ static int determineInitialDlbState(FILE *fplog, t_commrec *cr,
 
 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
 {
-    int dim;
-
     dd->ndim = 0;
     if (getenv("GMX_DD_ORDER_ZYX") != nullptr)
     {
@@ -6267,7 +3449,7 @@ static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
         {
             fprintf(fplog, "Using domain decomposition order z, y, x\n");
         }
-        for (dim = DIM-1; dim >= 0; dim--)
+        for (int dim = DIM-1; dim >= 0; dim--)
         {
             if (dd->nc[dim] > 1)
             {
@@ -6278,7 +3460,7 @@ static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
     else
     {
         /* Decomposition order x,y,z */
-        for (dim = 0; dim < DIM; dim++)
+        for (int dim = 0; dim < DIM; dim++)
         {
             if (dd->nc[dim] > 1)
             {
@@ -6286,31 +3468,24 @@ static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
             }
         }
     }
-}
-
-static gmx_domdec_comm_t *init_dd_comm()
-{
-    gmx_domdec_comm_t *comm;
-    int                i;
 
-    snew(comm, 1);
-    snew(comm->cggl_flag, DIM*2);
-    snew(comm->cgcm_state, DIM*2);
-    for (i = 0; i < DIM*2; i++)
+    if (dd->ndim == 0)
     {
-        comm->cggl_flag_nalloc[i]  = 0;
-        comm->cgcm_state_nalloc[i] = 0;
+        /* Set dim[0] to avoid extra checks on ndim in several places */
+        dd->dim[0] = XX;
     }
+}
 
-    comm->nalloc_int = 0;
-    comm->buf_int    = nullptr;
+static gmx_domdec_comm_t *init_dd_comm()
+{
+    gmx_domdec_comm_t *comm = new gmx_domdec_comm_t;
 
-    vec_rvec_init(&comm->vbuf);
+    comm->n_load_have      = 0;
+    comm->n_load_collect   = 0;
 
-    comm->n_load_have    = 0;
-    comm->n_load_collect = 0;
+    comm->haveTurnedOffDlb = false;
 
-    for (i = 0; i < ddnatNR-ddnatZONE; i++)
+    for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
     {
         comm->sum_nat[i] = 0;
     }
@@ -6335,18 +3510,15 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
                                    const MdrunOptions &mdrunOptions,
                                    const gmx_mtop_t *mtop,
                                    const t_inputrec *ir,
-                                   const matrix box, const rvec *xGlobal,
-                                   gmx_ddbox_t *ddbox,
-                                   int *npme_x, int *npme_y)
+                                   const matrix box,
+                                   gmx::ArrayRef<const gmx::RVec> xGlobal,
+                                   gmx_ddbox_t *ddbox)
 {
     real               r_bonded         = -1;
     real               r_bonded_limit   = -1;
     const real         tenPercentMargin = 1.1;
     gmx_domdec_comm_t *comm             = dd->comm;
 
-    snew(comm->cggl_flag, DIM*2);
-    snew(comm->cgcm_state, DIM*2);
-
     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
 
@@ -6371,11 +3543,11 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
     comm->bPMELoadBalDLBLimits = FALSE;
 
     /* Allocate the charge group/atom sorting struct */
-    snew(comm->sort, 1);
+    comm->sort = gmx::compat::make_unique<gmx_domdec_sort_t>();
 
     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
 
-    comm->bInterCGBondeds = ((ncg_mtop(mtop) > mtop->mols.nr) ||
+    comm->bInterCGBondeds = ((ncg_mtop(mtop) > gmx_mtop_num_molecules(*mtop)) ||
                              mtop->bIntermolecularInteractions);
     if (comm->bInterCGBondeds)
     {
@@ -6386,8 +3558,8 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
         comm->bInterCGMultiBody = FALSE;
     }
 
-    dd->bInterCGcons    = inter_charge_group_constraints(mtop);
-    dd->bInterCGsettles = inter_charge_group_settles(mtop);
+    dd->bInterCGcons    = gmx::inter_charge_group_constraints(*mtop);
+    dd->bInterCGsettles = gmx::inter_charge_group_settles(*mtop);
 
     if (ir->rlist == 0)
     {
@@ -6463,7 +3635,7 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
 
             if (MASTER(cr))
             {
-                dd_bonded_cg_distance(fplog, mtop, ir, xGlobal, box,
+                dd_bonded_cg_distance(fplog, mtop, ir, as_rvec_array(xGlobal.data()), box,
                                       options.checkBondedInteractions,
                                       &r_2b, &r_mb);
             }
@@ -6511,7 +3683,7 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
     if (dd->bInterCGcons && options.constraintCommunicationRange <= 0)
     {
         /* There is a cell size limit due to the constraints (P-LINCS) */
-        rconstr = constr_r_max(fplog, mtop, ir);
+        rconstr = gmx::constr_r_max(fplog, mtop, ir);
         if (fplog)
         {
             fprintf(fplog,
@@ -6542,7 +3714,7 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
     {
         copy_ivec(options.numCells, dd->nc);
         set_dd_dim(fplog, dd);
-        set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, xGlobal, ddbox);
+        set_ddbox_cr(cr, &dd->nc, ir, box, xGlobal, ddbox);
 
         if (options.numPmeRanks >= 0)
         {
@@ -6571,7 +3743,7 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
     }
     else
     {
-        set_ddbox_cr(cr, nullptr, ir, box, &comm->cgs_gl, xGlobal, ddbox);
+        set_ddbox_cr(cr, nullptr, ir, box, xGlobal, ddbox);
 
         /* We need to choose the optimal DD grid and possibly PME nodes */
         real limit =
@@ -6676,12 +3848,6 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
         comm->npmenodes_y   = 0;
     }
 
-    /* Technically we don't need both of these,
-     * but keeping both simplifies the code by avoiding recalculation.
-     */
-    *npme_x = comm->npmenodes_x;
-    *npme_y = comm->npmenodes_y;
-
     snew(comm->slb_frac, DIM);
     if (isDlbDisabled(comm))
     {
@@ -6733,9 +3899,9 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
 
     if (debug)
     {
-        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
+        fprintf(debug, "Bonded atom communication beyond the cut-off: %s\n"
                 "cellsize limit %f\n",
-                comm->bBondComm, comm->cellsize_limit);
+                gmx::boolToString(comm->bBondComm), comm->cellsize_limit);
     }
 
     if (MASTER(cr))
@@ -6747,29 +3913,24 @@ static void set_dd_limits_and_grid(FILE *fplog, t_commrec *cr, gmx_domdec_t *dd,
 static void set_dlb_limits(gmx_domdec_t *dd)
 
 {
-    int d;
-
-    for (d = 0; d < dd->ndim; d++)
+    for (int d = 0; d < dd->ndim; d++)
     {
-        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
+        /* Set the number of pulses to the value for DLB */
+        dd->comm->cd[d].ind.resize(dd->comm->cd[d].np_dlb);
+
         dd->comm->cellsize_min[dd->dim[d]] =
             dd->comm->cellsize_min_dlb[dd->dim[d]];
     }
 }
 
 
-static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
+static void turn_on_dlb(FILE *fplog, const t_commrec *cr, int64_t step)
 {
-    gmx_domdec_t      *dd;
-    gmx_domdec_comm_t *comm;
-    real               cellsize_min;
-    int                d, nc, i;
-
-    dd   = cr->dd;
-    comm = dd->comm;
+    gmx_domdec_t      *dd           = cr->dd;
+    gmx_domdec_comm_t *comm         = dd->comm;
 
-    cellsize_min = comm->cellsize_min[dd->dim[0]];
-    for (d = 1; d < dd->ndim; d++)
+    real               cellsize_min = comm->cellsize_min[dd->dim[0]];
+    for (int d = 1; d < dd->ndim; d++)
     {
         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
     }
@@ -6777,7 +3938,7 @@ static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
     /* Turn off DLB if we're too close to the cell size limit. */
     if (cellsize_min < comm->cellsize_limit*1.05)
     {
-        auto str = gmx::formatString("step %" GMX_PRId64 " Measured %.1f %% performance loss due to load imbalance, "
+        auto str = gmx::formatString("step %" PRId64 " Measured %.1f %% performance loss due to load imbalance, "
                                      "but the minimum cell size is smaller than 1.05 times the cell size limit."
                                      "Will no longer try dynamic load balancing.\n", step, dd_force_imb_perf_loss(dd)*100);
         dd_warning(cr, fplog, str.c_str());
@@ -6787,7 +3948,7 @@ static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
     }
 
     char buf[STRLEN];
-    sprintf(buf, "step %" GMX_PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
+    sprintf(buf, "step %" PRId64 " Turning on dynamic load balancing, because the performance loss due to load imbalance is %.1f %%.\n", step, dd_force_imb_perf_loss(dd)*100);
     dd_warning(cr, fplog, buf);
     comm->dlbState = edlbsOnCanTurnOff;
 
@@ -6803,44 +3964,46 @@ static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
      * so we do not need to communicate this.
      * The grid is completely uniform.
      */
-    for (d = 0; d < dd->ndim; d++)
+    for (int d = 0; d < dd->ndim; d++)
     {
-        if (comm->root[d])
+        RowMaster *rowMaster = comm->cellsizesWithDlb[d].rowMaster.get();
+
+        if (rowMaster)
         {
             comm->load[d].sum_m = comm->load[d].sum;
 
-            nc = dd->nc[dd->dim[d]];
-            for (i = 0; i < nc; i++)
+            int nc = dd->nc[dd->dim[d]];
+            for (int i = 0; i < nc; i++)
             {
-                comm->root[d]->cell_f[i]    = i/(real)nc;
+                rowMaster->cellFrac[i] = i/static_cast<real>(nc);
                 if (d > 0)
                 {
-                    comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
-                    comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
+                    rowMaster->bounds[i].cellFracLowerMax =  i     /static_cast<real>(nc);
+                    rowMaster->bounds[i].cellFracUpperMin = (i + 1)/static_cast<real>(nc);
                 }
             }
-            comm->root[d]->cell_f[nc] = 1.0;
+            rowMaster->cellFrac[nc] = 1.0;
         }
     }
 }
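/* Worked example, not from the GROMACS source: for a row of nc = 4 cells the
 * uniform fractions set above are
 *   cellFrac                   = 0.00, 0.25, 0.50, 0.75, 1.00
 *   bounds[i].cellFracLowerMax =  i      / nc  ->  0.00, 0.25, 0.50, 0.75
 *   bounds[i].cellFracUpperMin = (i + 1) / nc  ->  0.25, 0.50, 0.75, 1.00
 * so cell i initially spans the fractional range [i/nc, (i+1)/nc).
 */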
 
-static void turn_off_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
+static void turn_off_dlb(FILE *fplog, const t_commrec *cr, int64_t step)
 {
     gmx_domdec_t *dd = cr->dd;
 
     char          buf[STRLEN];
-    sprintf(buf, "step %" GMX_PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
+    sprintf(buf, "step %" PRId64 " Turning off dynamic load balancing, because it is degrading performance.\n", step);
     dd_warning(cr, fplog, buf);
     dd->comm->dlbState                     = edlbsOffCanTurnOn;
     dd->comm->haveTurnedOffDlb             = true;
     dd->comm->ddPartioningCountFirstDlbOff = dd->ddp_count;
 }
 
-static void turn_off_dlb_forever(FILE *fplog, t_commrec *cr, gmx_int64_t step)
+static void turn_off_dlb_forever(FILE *fplog, const t_commrec *cr, int64_t step)
 {
     GMX_RELEASE_ASSERT(cr->dd->comm->dlbState == edlbsOffCanTurnOn, "Can only turn off DLB forever when it was in the can-turn-on state");
     char buf[STRLEN];
-    sprintf(buf, "step %" GMX_PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
+    sprintf(buf, "step %" PRId64 " Will no longer try dynamic load balancing, as it degraded performance.\n", step);
     dd_warning(cr, fplog, buf);
     cr->dd->comm->dlbState = edlbsOffForever;
 }
@@ -6959,7 +4122,7 @@ static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
         fprintf(fplog, "\n\n");
     }
 
-    gmx_bool bInterCGVsites = count_intercg_vsites(mtop);
+    gmx_bool bInterCGVsites = count_intercg_vsites(mtop) != 0;
 
     if (comm->bInterCGBondeds ||
         bInterCGVsites ||
@@ -7044,7 +4207,7 @@ static void set_cell_limits_dlb(gmx_domdec_t      *dd,
          * Later cellsize_limit is redetermined,
          * so we can not miss interactions due to this rounding.
          */
-        npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
+        npulse = static_cast<int>(0.96 + comm->cutoff/comm->cellsize_limit);
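        /* Worked example, not from the GROMACS source: the 0.96 margin nearly
         * rounds the cut-off/cell-size ratio up while tolerating rounding
         * noise, e.g. cutoff/cellsize_limit = 2.0 gives
         * static_cast<int>(2.96) = 2 pulses, while 2.05 gives
         * static_cast<int>(3.01) = 3 pulses.
         */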
     }
     else
     {
@@ -7059,8 +4222,8 @@ static void set_cell_limits_dlb(gmx_domdec_t      *dd,
         for (d = 0; d < dd->ndim; d++)
         {
             dim      = dd->dim[d];
-            npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
-                             /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
+            npulse_d = static_cast<int>(1 + dd->nc[dim]*comm->cutoff
+                                        /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
             npulse_d_max = std::max(npulse_d_max, npulse_d);
         }
         npulse = std::min(npulse, npulse_d_max);
@@ -7078,9 +4241,7 @@ static void set_cell_limits_dlb(gmx_domdec_t      *dd,
     for (d = 0; d < dd->ndim; d++)
     {
         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
-        comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
-        snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
-        comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
+        comm->maxpulse        = std::max(comm->maxpulse, comm->cd[d].np_dlb);
         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
         {
             comm->bVacDLBNoLimit = FALSE;
@@ -7180,12 +4341,12 @@ static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
 
     if (ir->ePBC == epbcNONE)
     {
-        vol_frac = 1 - 1/(double)dd->nnodes;
+        vol_frac = 1 - 1/static_cast<double>(dd->nnodes);
     }
     else
     {
         vol_frac =
-            (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
+            (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/static_cast<double>(dd->nnodes);
     }
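    /* Illustrative note, not from the GROMACS source: vol_frac estimates the
     * fraction of all atoms a domain will know about (home plus communicated
     * atoms) and is only used below to size the global-to-local lookup, e.g.
     * without PBC and dd->nnodes = 8 it is 1 - 1/8 = 0.875.
     */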
     if (debug)
     {
@@ -7193,7 +4354,8 @@ static void set_ddgrid_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
     }
     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
 
-    dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
+    dd->ga2la  = new gmx_ga2la_t(natoms_tot,
+                                 static_cast<int>(vol_frac*natoms_tot));
 }
 
 /*! \brief Set some important DD parameters that can be modified by env.vars */
@@ -7201,7 +4363,7 @@ static void set_dd_envvar_options(FILE *fplog, gmx_domdec_t *dd, int rank_mysim)
 {
     gmx_domdec_comm_t *comm = dd->comm;
 
-    dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
+    dd->bSendRecv2      = (dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0) != 0);
     comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
     comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
     int recload         = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
@@ -7254,9 +4416,8 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
                                         const gmx_mtop_t *mtop,
                                         const t_inputrec *ir,
                                         const matrix box,
-                                        const rvec *xGlobal,
-                                        gmx_ddbox_t *ddbox,
-                                        int *npme_x, int *npme_y)
+                                        gmx::ArrayRef<const gmx::RVec> xGlobal,
+                                        gmx::LocalAtomSetManager *atomSets)
 {
     gmx_domdec_t      *dd;
 
@@ -7266,23 +4427,27 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
                 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
     }
 
-    snew(dd, 1);
+    dd = new gmx_domdec_t;
 
     dd->comm = init_dd_comm();
 
+    /* Initialize DD partitioning counters */
+    dd->comm->partition_step = INT_MIN;
+    dd->ddp_count            = 0;
+
     set_dd_envvar_options(fplog, dd, cr->nodeid);
 
+    gmx_ddbox_t ddbox = {0};
     set_dd_limits_and_grid(fplog, cr, dd, options, mdrunOptions,
                            mtop, ir,
                            box, xGlobal,
-                           ddbox,
-                           npme_x, npme_y);
+                           &ddbox);
 
     make_dd_communicators(fplog, cr, dd, options.rankOrder);
 
     if (thisRankHasDuty(cr, DUTY_PP))
     {
-        set_ddgrid_parameters(fplog, dd, options.dlbScaling, mtop, ir, ddbox);
+        set_ddgrid_parameters(fplog, dd, options.dlbScaling, mtop, ir, &ddbox);
 
         setup_neighbor_relations(dd);
     }
@@ -7290,15 +4455,10 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
     /* Set overallocation to avoid frequent reallocation of arrays */
     set_over_alloc_dd(TRUE);
 
-    /* Initialize DD partitioning counters */
-    dd->comm->partition_step = INT_MIN;
-    dd->ddp_count            = 0;
-
-    /* We don't know the number of threads yet; we set this later */
-    dd->comm->nth = 0;
-
     clear_dd_cycle_counts(dd);
 
+    dd->atomSets = atomSets;
+
     return dd;
 }
 
@@ -7314,8 +4474,7 @@ static gmx_bool test_dd_cutoff(t_commrec *cr,
 
     dd = cr->dd;
 
-    set_ddbox(dd, FALSE, cr, ir, state->box,
-              TRUE, &dd->comm->cgs_gl, as_rvec_array(state->x.data()), &ddbox);
+    set_ddbox(dd, false, ir, state->box, true, state->x, &ddbox);
 
     LocallyLimited = 0;
 
@@ -7329,7 +4488,7 @@ static gmx_bool test_dd_cutoff(t_commrec *cr,
             inv_cell_size *= DD_PRES_SCALE_MARGIN;
         }
 
-        np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
+        np = 1 + static_cast<int>(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
 
         if (!isDlbDisabled(dd->comm) && (dim < ddbox.npbcdim) && (dd->comm->cd[d].np_dlb > 0))
         {
@@ -7418,7 +4577,7 @@ static void dd_dlb_set_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd, gmx
     {
         dd->comm->bCheckWhetherToTurnDlbOn = bValue;
 
-        if (bValue == TRUE)
+        if (bValue)
         {
             /* Store the DD partitioning count, so we can ignore cycle counts
              * over the next nstlist steps, which are often slower.
@@ -7469,12 +4628,7 @@ static gmx_bool dd_dlb_get_should_check_whether_to_turn_dlb_on(gmx_domdec_t *dd)
     /* We check whether we should use DLB every c_checkTurnDlbOnInterval
     * partitionings (we do not do this every partitioning, so that we
      * avoid excessive communication). */
-    if (dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1)
-    {
-        return TRUE;
-    }
-
-    return FALSE;
+    return dd->comm->n_load_have % c_checkTurnDlbOnInterval == c_checkTurnDlbOnInterval - 1;
 }
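/* Illustrative note, not from the GROMACS source: the modulo test above is
 * true exactly when n_load_have is one short of a multiple of
 * c_checkTurnDlbOnInterval, so the DLB check runs once per interval of load
 * recordings rather than at every partitioning.
 */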
 
 gmx_bool dd_dlb_is_on(const gmx_domdec_t *dd)
@@ -7509,9 +4663,10 @@ void dd_dlb_unlock(gmx_domdec_t *dd)
 static void merge_cg_buffers(int ncell,
                              gmx_domdec_comm_dim_t *cd, int pulse,
                              int  *ncg_cell,
-                             int  *index_gl, int  *recv_i,
+                             gmx::ArrayRef<int> index_gl,
+                             const int  *recv_i,
                              rvec *cg_cm,    rvec *recv_vr,
-                             int *cgindex,
+                             gmx::ArrayRef<int> cgindex,
                              cginfo_mb_t *cginfo_mb, int *cginfo)
 {
     gmx_domdec_ind_t *ind, *ind_p;
@@ -7590,26 +4745,27 @@ static void merge_cg_buffers(int ncell,
     }
 }
 
-static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
-                               int nzone, int cg0, const int *cgindex)
+static void make_cell2at_index(gmx_domdec_comm_dim_t        *cd,
+                               int                           nzone,
+                               int                           atomGroupStart,
+                               const gmx::RangePartitioning &atomGroups)
 {
-    int cg, zone, p;
-
     /* Store the atom block boundaries for easy copying of communication buffers
      */
-    cg = cg0;
-    for (zone = 0; zone < nzone; zone++)
+    int g = atomGroupStart;
+    for (int zone = 0; zone < nzone; zone++)
     {
-        for (p = 0; p < cd->np; p++)
+        for (gmx_domdec_ind_t &ind : cd->ind)
         {
-            cd->ind[p].cell2at0[zone] = cgindex[cg];
-            cg += cd->ind[p].nrecv[zone];
-            cd->ind[p].cell2at1[zone] = cgindex[cg];
+            const auto range    = atomGroups.subRange(g, g + ind.nrecv[zone]);
+            ind.cell2at0[zone]  = range.begin();
+            ind.cell2at1[zone]  = range.end();
+            g                  += ind.nrecv[zone];
         }
     }
 }
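For context, a minimal standalone sketch (not part of this patch, and using plain std containers instead of the gmx types) of the mapping make_cell2at_index relies on: an atom-group partitioning is a prefix-sum index over group sizes, so a run of nrecv groups starting at group g corresponds to the atom range [index[g], index[g + nrecv]).

#include <cstdio>
#include <vector>

int main()
{
    // Prefix-sum index over hypothetical atom-group sizes 2, 1, 3, 2
    std::vector<int> index = { 0, 2, 3, 6, 8 };

    int g     = 1;                // first received group
    int nrecv = 2;                // groups received for this zone/pulse
    int at0   = index[g];         // first atom of the block
    int at1   = index[g + nrecv]; // one past the last atom
    std::printf("atoms [%d, %d)\n", at0, at1); // prints "atoms [2, 6)"
    return 0;
}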
 
-static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
+static gmx_bool missing_link(t_blocka *link, int cg_gl, const char *bLocalCG)
 {
     int      i;
     gmx_bool bMiss;
@@ -7734,35 +4890,33 @@ set_dd_corners(const gmx_domdec_t *dd,
     }
 }
 
-/* Determine which cg's we need to send in this pulse from this zone */
+/* Add the atom groups we need to send in this pulse from this zone to
+ * \p localAtomGroups and \p work
+ */
 static void
 get_zone_pulse_cgs(gmx_domdec_t *dd,
                    int zonei, int zone,
                    int cg0, int cg1,
-                   const int *index_gl,
-                   const int *cgindex,
+                   gmx::ArrayRef<const int> globalAtomGroupIndices,
+                   const gmx::RangePartitioning &atomGroups,
                    int dim, int dim_ind,
                    int dim0, int dim1, int dim2,
                    real r_comm2, real r_bcomm2,
                    matrix box,
-                   ivec tric_dist,
+                   bool distanceIsTriclinic,
                    rvec *normal,
                    real skew_fac2_d, real skew_fac_01,
                    rvec *v_d, rvec *v_0, rvec *v_1,
                    const dd_corners_t *c,
-                   rvec sf2_round,
+                   const rvec sf2_round,
                    gmx_bool bDistBonded,
                    gmx_bool bBondComm,
                    gmx_bool bDist2B,
                    gmx_bool bDistMB,
                    rvec *cg_cm,
-                   int *cginfo,
-                   gmx_domdec_ind_t *ind,
-                   int **ibuf, int *ibuf_nalloc,
-                   vec_rvec_t *vbuf,
-                   int *nsend_ptr,
-                   int *nat_ptr,
-                   int *nsend_z_ptr)
+                   const int *cginfo,
+                   std::vector<int>     *localAtomGroups,
+                   dd_comm_setup_work_t *work)
 {
     gmx_domdec_comm_t *comm;
     gmx_bool           bScrew;
@@ -7771,7 +4925,7 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
     real               r2, rb2, r, tric_sh;
     rvec               rn, rb;
     int                dimd;
-    int                nsend_z, nsend, nat;
+    int                nsend_z, nat;
 
     comm = dd->comm;
 
@@ -7779,15 +4933,17 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
 
     bDistMB_pulse = (bDistMB && bDistBonded);
 
-    nsend_z = 0;
-    nsend   = *nsend_ptr;
-    nat     = *nat_ptr;
+    /* Unpack the work data */
+    std::vector<int>       &ibuf = work->atomGroupBuffer;
+    std::vector<gmx::RVec> &vbuf = work->positionBuffer;
+    nsend_z                      = 0;
+    nat                          = work->nat;
 
     for (cg = cg0; cg < cg1; cg++)
     {
         r2  = 0;
         rb2 = 0;
-        if (tric_dist[dim_ind] == 0)
+        if (!distanceIsTriclinic)
         {
             /* Rectangular direction, easy */
             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
@@ -7958,47 +5114,46 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
               (bDist2B && r2  < r_bcomm2)) &&
              (!bBondComm ||
               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
-               missing_link(comm->cglink, index_gl[cg],
+               missing_link(comm->cglink, globalAtomGroupIndices[cg],
                             comm->bLocalCG)))))
         {
-            /* Make an index to the local charge groups */
-            if (nsend+1 > ind->nalloc)
-            {
-                ind->nalloc = over_alloc_large(nsend+1);
-                srenew(ind->index, ind->nalloc);
-            }
-            if (nsend+1 > *ibuf_nalloc)
-            {
-                *ibuf_nalloc = over_alloc_large(nsend+1);
-                srenew(*ibuf, *ibuf_nalloc);
-            }
-            ind->index[nsend] = cg;
-            (*ibuf)[nsend]    = index_gl[cg];
+            /* Store the local and global atom group indices and position */
+            localAtomGroups->push_back(cg);
+            ibuf.push_back(globalAtomGroupIndices[cg]);
             nsend_z++;
-            vec_rvec_check_alloc(vbuf, nsend+1);
 
+            rvec posPbc;
             if (dd->ci[dim] == 0)
             {
                 /* Correct cg_cm for pbc */
-                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
+                rvec_add(cg_cm[cg], box[dim], posPbc);
                 if (bScrew)
                 {
-                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
-                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
+                    posPbc[YY] = box[YY][YY] - posPbc[YY];
+                    posPbc[ZZ] = box[ZZ][ZZ] - posPbc[ZZ];
                 }
             }
             else
             {
-                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
+                copy_rvec(cg_cm[cg], posPbc);
             }
-            nsend++;
-            nat += cgindex[cg+1] - cgindex[cg];
+            vbuf.emplace_back(posPbc[XX], posPbc[YY], posPbc[ZZ]);
+
+            nat += atomGroups.block(cg).size();
         }
     }
 
-    *nsend_ptr   = nsend;
-    *nat_ptr     = nat;
-    *nsend_z_ptr = nsend_z;
+    work->nat        = nat;
+    work->nsend_zone = nsend_z;
+}
+
+static void clearCommSetupData(dd_comm_setup_work_t *work)
+{
+    work->localAtomGroupBuffer.clear();
+    work->atomGroupBuffer.clear();
+    work->positionBuffer.clear();
+    work->nat        = 0;
+    work->nsend_zone = 0;
 }
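The setup code below uses a scatter-then-append pattern across OpenMP threads. A minimal standalone sketch (hypothetical data, not GROMACS code) of that pattern: thread 0 keeps its buffer, and the buffers of the remaining threads are appended to it after the parallel region.

#include <cstdio>
#include <vector>

int main()
{
    // Per-thread scratch buffers (filled serially here, hypothetical contents)
    std::vector<std::vector<int>> perThread = { { 1, 2 }, { 3 }, { 4, 5 } };

    // After the parallel region, thread 0's buffer receives everything else
    std::vector<int> &combined = perThread[0];
    for (size_t th = 1; th < perThread.size(); th++)
    {
        combined.insert(combined.end(), perThread[th].begin(), perThread[th].end());
    }

    for (int v : combined)
    {
        std::printf("%d ", v); // 1 2 3 4 5
    }
    std::printf("\n");
    return 0;
}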
 
 static void setup_dd_communication(gmx_domdec_t *dd,
@@ -8006,24 +5161,20 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                                    t_forcerec *fr,
                                    t_state *state, PaddedRVecVector *f)
 {
-    int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
+    int                    dim_ind, dim, dim0, dim1, dim2, dimd, nat_tot;
     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
     int                    c, i, cg, cg_gl, nrcg;
-    int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
+    int                   *zone_cg_range, pos_cg;
     gmx_domdec_comm_t     *comm;
     gmx_domdec_zones_t    *zones;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t      *ind;
     cginfo_mb_t           *cginfo_mb;
     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
     real                   r_comm2, r_bcomm2;
     dd_corners_t           corners;
-    ivec                   tric_dist;
-    rvec                  *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr, *recv_vr;
+    rvec                  *cg_cm, *normal, *v_d, *v_0 = nullptr, *v_1 = nullptr;
     real                   skew_fac2_d, skew_fac_01;
     rvec                   sf2_round;
-    int                    nsend, nat;
-    int                    th;
 
     if (debug)
     {
@@ -8032,17 +5183,14 @@ static void setup_dd_communication(gmx_domdec_t *dd,
 
     comm  = dd->comm;
 
-    if (comm->nth == 0)
+    if (comm->dth.empty())
     {
         /* Initialize the thread data.
          * This cannot be done in init_domain_decomposition,
          * as the number of threads is determined later.
          */
-        comm->nth = gmx_omp_nthreads_get(emntDomdec);
-        if (comm->nth > 1)
-        {
-            snew(comm->dth, comm->nth);
-        }
+        int numThreads = gmx_omp_nthreads_get(emntDomdec);
+        comm->dth.resize(numThreads);
     }
 
     switch (fr->cutoff_scheme)
@@ -8055,20 +5203,6 @@ static void setup_dd_communication(gmx_domdec_t *dd,
             break;
         default:
             gmx_incons("unimplemented");
-            cg_cm = nullptr;
-    }
-
-    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
-    {
-        /* Check if we need to use triclinic distances */
-        tric_dist[dim_ind] = 0;
-        for (i = 0; i <= dim_ind; i++)
-        {
-            if (ddbox->tric_dir[dd->dim[i]])
-            {
-                tric_dist[dim_ind] = 1;
-            }
-        }
     }
 
     bBondComm = comm->bBondComm;
@@ -8084,7 +5218,7 @@ static void setup_dd_communication(gmx_domdec_t *dd,
 
     if (debug)
     {
-        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, std::sqrt(r_bcomm2));
+        fprintf(debug, "bBondComm %s, r_bc %f\n", gmx::boolToString(bBondComm), std::sqrt(r_bcomm2));
     }
 
     zones = &comm->zones;
@@ -8121,8 +5255,6 @@ static void setup_dd_communication(gmx_domdec_t *dd,
     }
 
     zone_cg_range = zones->cg_range;
-    index_gl      = dd->index_gl;
-    cgindex       = dd->cgindex;
     cginfo_mb     = fr->cginfo_mb;
 
     zone_cg_range[0]   = 0;
@@ -8130,13 +5262,23 @@ static void setup_dd_communication(gmx_domdec_t *dd,
     comm->zone_ncg1[0] = dd->ncg_home;
     pos_cg             = dd->ncg_home;
 
-    nat_tot = dd->nat_home;
+    nat_tot = comm->atomRanges.numHomeAtoms();
     nzone   = 1;
     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
     {
         dim = dd->dim[dim_ind];
         cd  = &comm->cd[dim_ind];
 
+        /* Check if we need to compute triclinic distances along this dim */
+        bool distanceIsTriclinic = false;
+        for (i = 0; i <= dim_ind; i++)
+        {
+            if (ddbox->tric_dir[dd->dim[i]])
+            {
+                distanceIsTriclinic = true;
+            }
+        }
+
         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
         {
             /* No pbc in this dimension, the first node should not comm. */
@@ -8147,23 +5289,26 @@ static void setup_dd_communication(gmx_domdec_t *dd,
             nzone_send = nzone;
         }
 
-        v_d         = ddbox->v[dim];
-        skew_fac2_d = gmx::square(ddbox->skew_fac[dim]);
+        v_d                = ddbox->v[dim];
+        skew_fac2_d        = gmx::square(ddbox->skew_fac[dim]);
 
-        cd->bInPlace = TRUE;
-        for (p = 0; p < cd->np; p++)
+        cd->receiveInPlace = true;
+        for (int p = 0; p < cd->numPulses(); p++)
         {
             /* Only atoms communicated in the first pulse are used
              * for multi-body bonded interactions or for bBondComm.
              */
             bDistBonded = ((bDistMB || bDist2B) && p == 0);
 
-            ind   = &cd->ind[p];
-            nsend = 0;
-            nat   = 0;
+            gmx_domdec_ind_t *ind = &cd->ind[p];
+
+            /* Thread 0 writes into the global index array */
+            ind->index.clear();
+            clearCommSetupData(&comm->dth[0]);
+
             for (zone = 0; zone < nzone_send; zone++)
             {
-                if (tric_dist[dim_ind] && dim_ind > 0)
+                if (dim_ind > 0 && distanceIsTriclinic)
                 {
                     /* Determine slightly more optimized skew_fac's
                      * for rounding.
@@ -8210,111 +5355,54 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
                 }
 
-#pragma omp parallel for num_threads(comm->nth) schedule(static)
-                for (th = 0; th < comm->nth; th++)
+                const int numThreads = static_cast<int>(comm->dth.size());
+#pragma omp parallel for num_threads(numThreads) schedule(static)
+                for (int th = 0; th < numThreads; th++)
                 {
                     try
                     {
-                        gmx_domdec_ind_t *ind_p;
-                        int             **ibuf_p, *ibuf_nalloc_p;
-                        vec_rvec_t       *vbuf_p;
-                        int              *nsend_p, *nat_p;
-                        int              *nsend_zone_p;
-                        int               cg0_th, cg1_th;
-
-                        if (th == 0)
-                        {
-                            /* Thread 0 writes in the comm buffers */
-                            ind_p         = ind;
-                            ibuf_p        = &comm->buf_int;
-                            ibuf_nalloc_p = &comm->nalloc_int;
-                            vbuf_p        = &comm->vbuf;
-                            nsend_p       = &nsend;
-                            nat_p         = &nat;
-                            nsend_zone_p  = &ind->nsend[zone];
-                        }
-                        else
-                        {
-                            /* Other threads write into temp buffers */
-                            ind_p         = &comm->dth[th].ind;
-                            ibuf_p        = &comm->dth[th].ibuf;
-                            ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
-                            vbuf_p        = &comm->dth[th].vbuf;
-                            nsend_p       = &comm->dth[th].nsend;
-                            nat_p         = &comm->dth[th].nat;
-                            nsend_zone_p  = &comm->dth[th].nsend_zone;
-
-                            comm->dth[th].nsend      = 0;
-                            comm->dth[th].nat        = 0;
-                            comm->dth[th].nsend_zone = 0;
-                        }
+                        dd_comm_setup_work_t &work = comm->dth[th];
 
-                        if (comm->nth == 1)
-                        {
-                            cg0_th = cg0;
-                            cg1_th = cg1;
-                        }
-                        else
+                        /* Retain data accumulated into buffers of thread 0 */
+                        if (th > 0)
                         {
-                            cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
-                            cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
+                            clearCommSetupData(&work);
                         }
 
+                        int cg0_th = cg0 + ((cg1 - cg0)* th   )/numThreads;
+                        int cg1_th = cg0 + ((cg1 - cg0)*(th+1))/numThreads;
+
                         /* Get the cg's for this pulse in this zone */
                         get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
-                                           index_gl, cgindex,
+                                           dd->globalAtomGroupIndices,
+                                           dd->atomGrouping(),
                                            dim, dim_ind, dim0, dim1, dim2,
                                            r_comm2, r_bcomm2,
-                                           box, tric_dist,
+                                           box, distanceIsTriclinic,
                                            normal, skew_fac2_d, skew_fac_01,
                                            v_d, v_0, v_1, &corners, sf2_round,
                                            bDistBonded, bBondComm,
                                            bDist2B, bDistMB,
                                            cg_cm, fr->cginfo,
-                                           ind_p,
-                                           ibuf_p, ibuf_nalloc_p,
-                                           vbuf_p,
-                                           nsend_p, nat_p,
-                                           nsend_zone_p);
+                                           th == 0 ? &ind->index : &work.localAtomGroupBuffer,
+                                           &work);
                     }
                     GMX_CATCH_ALL_AND_EXIT_WITH_FATAL_ERROR;
                 } // END
 
+                std::vector<int>       &atomGroups = comm->dth[0].atomGroupBuffer;
+                std::vector<gmx::RVec> &positions  = comm->dth[0].positionBuffer;
+                ind->nsend[zone]  = comm->dth[0].nsend_zone;
                 /* Append data of threads>=1 to the communication buffers */
-                for (th = 1; th < comm->nth; th++)
+                for (int th = 1; th < numThreads; th++)
                 {
-                    dd_comm_setup_work_t *dth;
-                    int                   i, ns1;
-
-                    dth = &comm->dth[th];
-
-                    ns1 = nsend + dth->nsend_zone;
-                    if (ns1 > ind->nalloc)
-                    {
-                        ind->nalloc = over_alloc_dd(ns1);
-                        srenew(ind->index, ind->nalloc);
-                    }
-                    if (ns1 > comm->nalloc_int)
-                    {
-                        comm->nalloc_int = over_alloc_dd(ns1);
-                        srenew(comm->buf_int, comm->nalloc_int);
-                    }
-                    if (ns1 > comm->vbuf.nalloc)
-                    {
-                        comm->vbuf.nalloc = over_alloc_dd(ns1);
-                        srenew(comm->vbuf.v, comm->vbuf.nalloc);
-                    }
+                    const dd_comm_setup_work_t &dth = comm->dth[th];
 
-                    for (i = 0; i < dth->nsend_zone; i++)
-                    {
-                        ind->index[nsend]    = dth->ind.index[i];
-                        comm->buf_int[nsend] = dth->ibuf[i];
-                        copy_rvec(dth->vbuf.v[i],
-                                  comm->vbuf.v[nsend]);
-                        nsend++;
-                    }
-                    nat              += dth->nat;
-                    ind->nsend[zone] += dth->nsend_zone;
+                    ind->index.insert(ind->index.end(), dth.localAtomGroupBuffer.begin(), dth.localAtomGroupBuffer.end());
+                    atomGroups.insert(atomGroups.end(), dth.atomGroupBuffer.begin(), dth.atomGroupBuffer.end());
+                    positions.insert(positions.end(), dth.positionBuffer.begin(), dth.positionBuffer.end());
+                    comm->dth[0].nat += dth.nat;
+                    ind->nsend[zone] += dth.nsend_zone;
                 }
             }
             /* Clear the counts in case we do not have pbc */
@@ -8322,17 +5410,12 @@ static void setup_dd_communication(gmx_domdec_t *dd,
             {
                 ind->nsend[zone] = 0;
             }
-            ind->nsend[nzone]   = nsend;
-            ind->nsend[nzone+1] = nat;
+            ind->nsend[nzone]     = ind->index.size();
+            ind->nsend[nzone + 1] = comm->dth[0].nat;
             /* Communicate the number of cg's and atoms to receive */
-            dd_sendrecv_int(dd, dim_ind, dddirBackward,
-                            ind->nsend, nzone+2,
-                            ind->nrecv, nzone+2);
-
-            /* The rvec buffer is also required for atom buffers of size nsend
-             * in dd_move_x and dd_move_f.
-             */
-            vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
+            ddSendrecv(dd, dim_ind, dddirBackward,
+                       ind->nsend, nzone+2,
+                       ind->nrecv, nzone+2);
 
             if (p > 0)
             {
@@ -8341,45 +5424,37 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                 {
                     if (ind->nrecv[zone] > 0)
                     {
-                        cd->bInPlace = FALSE;
-                    }
-                }
-                if (!cd->bInPlace)
-                {
-                    /* The int buffer is only required here for the cg indices */
-                    if (ind->nrecv[nzone] > comm->nalloc_int2)
-                    {
-                        comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
-                        srenew(comm->buf_int2, comm->nalloc_int2);
+                        cd->receiveInPlace = false;
                     }
-                    /* The rvec buffer is also required for atom buffers
-                     * of size nrecv in dd_move_x and dd_move_f.
-                     */
-                    i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
-                    vec_rvec_check_alloc(&comm->vbuf2, i);
                 }
             }
 
-            /* Make space for the global cg indices */
-            if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
-                || dd->cg_nalloc == 0)
+            int receiveBufferSize = 0;
+            if (!cd->receiveInPlace)
             {
-                dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
-                srenew(index_gl, dd->cg_nalloc);
-                srenew(cgindex, dd->cg_nalloc+1);
+                receiveBufferSize = ind->nrecv[nzone];
             }
+            /* These buffers are only needed when not receiving in place */
+            DDBufferAccess<int>       globalAtomGroupBuffer(comm->intBuffer, receiveBufferSize);
+            DDBufferAccess<gmx::RVec> rvecBuffer(comm->rvecBuffer, receiveBufferSize);
+
+            dd_comm_setup_work_t     &work = comm->dth[0];
+
+            /* Make space for the global cg indices */
+            int numAtomGroupsNew = pos_cg + ind->nrecv[nzone];
+            dd->globalAtomGroupIndices.resize(numAtomGroupsNew);
             /* Communicate the global cg indices */
-            if (cd->bInPlace)
+            gmx::ArrayRef<int> integerBufferRef;
+            if (cd->receiveInPlace)
             {
-                recv_i = index_gl + pos_cg;
+                integerBufferRef = gmx::arrayRefFromArray(dd->globalAtomGroupIndices.data() + pos_cg, ind->nrecv[nzone]);
             }
             else
             {
-                recv_i = comm->buf_int2;
+                integerBufferRef = globalAtomGroupBuffer.buffer;
             }
-            dd_sendrecv_int(dd, dim_ind, dddirBackward,
-                            comm->buf_int, nsend,
-                            recv_i,        ind->nrecv[nzone]);
+            ddSendrecv<int>(dd, dim_ind, dddirBackward,
+                            work.atomGroupBuffer, integerBufferRef);
 
             /* Make space for cg_cm */
             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
@@ -8392,30 +5467,30 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                 cg_cm = as_rvec_array(state->x.data());
             }
             /* Communicate cg_cm */
-            if (cd->bInPlace)
+            gmx::ArrayRef<gmx::RVec> rvecBufferRef;
+            if (cd->receiveInPlace)
             {
-                recv_vr = cg_cm + pos_cg;
+                rvecBufferRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cg_cm + pos_cg), ind->nrecv[nzone]);
             }
             else
             {
-                recv_vr = comm->vbuf2.v;
+                rvecBufferRef = rvecBuffer.buffer;
             }
-            dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
-                             comm->vbuf.v, nsend,
-                             recv_vr,      ind->nrecv[nzone]);
+            ddSendrecv<gmx::RVec>(dd, dim_ind, dddirBackward,
+                                  work.positionBuffer, rvecBufferRef);
 
             /* Make the charge group index */
-            if (cd->bInPlace)
+            if (cd->receiveInPlace)
             {
                 zone = (p == 0 ? 0 : nzone - 1);
                 while (zone < nzone)
                 {
                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
                     {
-                        cg_gl              = index_gl[pos_cg];
-                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
-                        nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
-                        cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
+                        cg_gl                              = dd->globalAtomGroupIndices[pos_cg];
+                        fr->cginfo[pos_cg]                 = ddcginfo(cginfo_mb, cg_gl);
+                        nrcg                               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
+                        dd->atomGrouping_.appendBlock(nrcg);
                         if (bBondComm)
                         {
                             /* Update the charge group presence,
@@ -8436,37 +5511,35 @@ static void setup_dd_communication(gmx_domdec_t *dd,
             else
             {
                 /* This part of the code is never executed with bBondComm. */
+                std::vector<int> &atomGroupsIndex = dd->atomGrouping_.rawIndex();
+                atomGroupsIndex.resize(numAtomGroupsNew + 1);
+
                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
-                                 index_gl, recv_i, cg_cm, recv_vr,
-                                 cgindex, fr->cginfo_mb, fr->cginfo);
+                                 dd->globalAtomGroupIndices, integerBufferRef.data(),
+                                 cg_cm, as_rvec_array(rvecBufferRef.data()),
+                                 atomGroupsIndex,
+                                 fr->cginfo_mb, fr->cginfo);
                 pos_cg += ind->nrecv[nzone];
             }
             nat_tot += ind->nrecv[nzone+1];
         }
-        if (!cd->bInPlace)
+        if (!cd->receiveInPlace)
         {
             /* Store the atom block for easy copying of communication buffers */
-            make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
+            make_cell2at_index(cd, nzone, zone_cg_range[nzone], dd->atomGrouping());
         }
         nzone += nzone;
     }
-    dd->index_gl = index_gl;
-    dd->cgindex  = cgindex;
 
-    dd->ncg_tot          = zone_cg_range[zones->n];
-    dd->nat_tot          = nat_tot;
-    comm->nat[ddnatHOME] = dd->nat_home;
-    for (i = ddnatZONE; i < ddnatNR; i++)
-    {
-        comm->nat[i] = dd->nat_tot;
-    }
+    comm->atomRanges.setEnd(DDAtomRanges::Type::Zones, nat_tot);
 
     if (!bBondComm)
     {
         /* We don't need to update cginfo, since that was already done above.
          * So we pass NULL for the forcerec.
          */
-        dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
+        dd_set_cginfo(dd->globalAtomGroupIndices,
+                      dd->ncg_home, dd->globalAtomGroupIndices.size(),
                       nullptr, comm->bLocalCG);
     }
 
@@ -8633,7 +5706,9 @@ static void set_zones_size(gmx_domdec_t *dd,
         {
             if (zones->shift[zi][dim] == 0)
             {
-                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
+                /* We should only use zones up to zone_end */
+                int jZoneEnd = std::min(zones->izone[zi].j1, zone_end);
+                for (z = zones->izone[zi].j0; z < jZoneEnd; z++)
                 {
                     if (zones->shift[z][dim] > 0)
                     {
@@ -8752,288 +5827,251 @@ static void set_zones_size(gmx_domdec_t *dd,
     }
 }
 
-static int comp_cgsort(const void *a, const void *b)
+static bool comp_cgsort(const gmx_cgsort_t &a, const gmx_cgsort_t &b)
 {
-    int           comp;
 
-    gmx_cgsort_t *cga, *cgb;
-    cga = (gmx_cgsort_t *)a;
-    cgb = (gmx_cgsort_t *)b;
-
-    comp = cga->nsc - cgb->nsc;
-    if (comp == 0)
+    if (a.nsc == b.nsc)
     {
-        comp = cga->ind_gl - cgb->ind_gl;
+        return a.ind_gl < b.ind_gl;
     }
-
-    return comp;
+    return a.nsc < b.nsc;
 }
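An equivalent formulation of this comparator (illustration only, not a proposed change) using std::tie makes the lexicographic (nsc, ind_gl) ordering explicit:

#include <algorithm>
#include <tuple>
#include <vector>

struct Entry { int nsc; int ind_gl; };  // stand-in for the gmx_cgsort_t fields used

static bool compEntry(const Entry &a, const Entry &b)
{
    // Compare on the grid cell first, break ties on the global index
    return std::tie(a.nsc, a.ind_gl) < std::tie(b.nsc, b.ind_gl);
}

int main()
{
    std::vector<Entry> v = { { 2, 7 }, { 1, 9 }, { 2, 3 } };
    std::sort(v.begin(), v.end(), compEntry); // -> (1,9), (2,3), (2,7)
    return 0;
}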
 
-static void order_int_cg(int n, const gmx_cgsort_t *sort,
-                         int *a, int *buf)
+/* Order data in \p dataToSort according to \p sort
+ *
+ * Note: both buffers should have at least \p sort.size() elements.
+ */
+template <typename T>
+static void
+orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
+            gmx::ArrayRef<T>                   dataToSort,
+            gmx::ArrayRef<T>                   sortBuffer)
 {
-    int i;
+    GMX_ASSERT(dataToSort.size() >= sort.size(), "The vector needs to be sufficiently large");
+    GMX_ASSERT(sortBuffer.size() >= sort.size(), "The sorting buffer needs to be sufficiently large");
 
-    /* Order the data */
-    for (i = 0; i < n; i++)
+    /* Order the data into the temporary buffer */
+    size_t i = 0;
+    for (const gmx_cgsort_t &entry : sort)
     {
-        buf[i] = a[sort[i].ind];
+        sortBuffer[i++] = dataToSort[entry.ind];
     }
 
     /* Copy back to the original array */
-    for (i = 0; i < n; i++)
-    {
-        a[i] = buf[i];
-    }
+    std::copy(sortBuffer.begin(), sortBuffer.begin() + sort.size(),
+              dataToSort.begin());
 }
 
-static void order_vec_cg(int n, const gmx_cgsort_t *sort,
-                         rvec *v, rvec *buf)
+/* Order data in \p vectorToSort according to \p sort
+ *
+ * Note: \p vectorToSort should have at least \p sort.size() elements,
+ *       \p workVector is resized when it is too small.
+ */
+template <typename T>
+static void
+orderVector(gmx::ArrayRef<const gmx_cgsort_t>  sort,
+            gmx::ArrayRef<T>                   vectorToSort,
+            std::vector<T>                    *workVector)
 {
-    int i;
-
-    /* Order the data */
-    for (i = 0; i < n; i++)
-    {
-        copy_rvec(v[sort[i].ind], buf[i]);
-    }
-
-    /* Copy back to the original array */
-    for (i = 0; i < n; i++)
+    if (gmx::index(workVector->size()) < sort.size())
     {
-        copy_rvec(buf[i], v[i]);
+        workVector->resize(sort.size());
     }
+    orderVector<T>(sort, vectorToSort, *workVector);
 }
 
-static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
-                           rvec *v, rvec *buf)
+static void order_vec_atom(const gmx::RangePartitioning      *atomGroups,
+                           gmx::ArrayRef<const gmx_cgsort_t>  sort,
+                           gmx::ArrayRef<gmx::RVec>           v,
+                           gmx::ArrayRef<gmx::RVec>           buf)
 {
-    int a, atot, cg, cg0, cg1, i;
-
-    if (cgindex == nullptr)
+    if (atomGroups == nullptr)
     {
         /* Avoid the useless loop of the atoms within a cg */
-        order_vec_cg(ncg, sort, v, buf);
+        orderVector(sort, v, buf);
 
         return;
     }
 
     /* Order the data */
-    a = 0;
-    for (cg = 0; cg < ncg; cg++)
+    int a = 0;
+    for (const gmx_cgsort_t &entry : sort)
     {
-        cg0 = cgindex[sort[cg].ind];
-        cg1 = cgindex[sort[cg].ind+1];
-        for (i = cg0; i < cg1; i++)
+        for (int i : atomGroups->block(entry.ind))
         {
             copy_rvec(v[i], buf[a]);
             a++;
         }
     }
-    atot = a;
+    int atot = a;
 
     /* Copy back to the original array */
-    for (a = 0; a < atot; a++)
+    for (int a = 0; a < atot; a++)
     {
         copy_rvec(buf[a], v[a]);
     }
 }
 
-static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
-                         int nsort_new, gmx_cgsort_t *sort_new,
-                         gmx_cgsort_t *sort1)
+/* Returns whether a < b */
+static bool compareCgsort(const gmx_cgsort_t &a,
+                          const gmx_cgsort_t &b)
 {
-    int i1, i2, i_new;
+    return (a.nsc < b.nsc ||
+            (a.nsc == b.nsc && a.ind_gl < b.ind_gl));
+}
 
+static void orderedSort(gmx::ArrayRef<const gmx_cgsort_t>  stationary,
+                        gmx::ArrayRef<gmx_cgsort_t>        moved,
+                        std::vector<gmx_cgsort_t>         *sort1)
+{
     /* The new indices are not very ordered, so we sort them first */
-    gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
+    std::sort(moved.begin(), moved.end(), comp_cgsort);
 
-    /* sort2 is already ordered, so now we can merge the two arrays */
-    i1    = 0;
-    i2    = 0;
-    i_new = 0;
-    while (i2 < nsort2 || i_new < nsort_new)
-    {
-        if (i2 == nsort2)
-        {
-            sort1[i1++] = sort_new[i_new++];
-        }
-        else if (i_new == nsort_new)
-        {
-            sort1[i1++] = sort2[i2++];
-        }
-        else if (sort2[i2].nsc < sort_new[i_new].nsc ||
-                 (sort2[i2].nsc == sort_new[i_new].nsc &&
-                  sort2[i2].ind_gl < sort_new[i_new].ind_gl))
-        {
-            sort1[i1++] = sort2[i2++];
-        }
-        else
-        {
-            sort1[i1++] = sort_new[i_new++];
-        }
-    }
+    /* stationary is already ordered, so now we can merge the two arrays */
+    sort1->resize(stationary.size() + moved.size());
+    std::merge(stationary.begin(), stationary.end(),
+               moved.begin(), moved.end(),
+               sort1->begin(),
+               compareCgsort);
 }
 
-static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
+/* Set the sorting order for systems with charge groups, returned in sort->sorted.
+ * The order is according to the global charge group index.
+ * This adds and removes charge groups that moved between domains.
+ */
+static void dd_sort_order(const gmx_domdec_t *dd,
+                          const t_forcerec   *fr,
+                          int                 ncg_home_old,
+                          gmx_domdec_sort_t  *sort)
 {
-    gmx_domdec_sort_t *sort;
-    gmx_cgsort_t      *cgsort, *sort_i;
-    int                ncg_new, nsort2, nsort_new, i, *a, moved;
-
-    sort = dd->comm->sort;
+    const int *a          = fr->ns->grid->cell_index;
 
-    a = fr->ns->grid->cell_index;
-
-    moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
+    const int  movedValue = NSGRID_SIGNAL_MOVED_FAC*fr->ns->grid->ncells;
 
     if (ncg_home_old >= 0)
     {
+        std::vector<gmx_cgsort_t> &stationary = sort->stationary;
+        std::vector<gmx_cgsort_t> &moved      = sort->moved;
+
         /* The charge groups that remained in the same ns grid cell
          * are completely ordered. So we can sort efficiently by sorting
          * the charge groups that did move into the stationary list.
+         * Note: push_back() seems to be slightly slower than direct access.
          */
-        ncg_new   = 0;
-        nsort2    = 0;
-        nsort_new = 0;
-        for (i = 0; i < dd->ncg_home; i++)
+        stationary.clear();
+        moved.clear();
+        for (int i = 0; i < dd->ncg_home; i++)
         {
             /* Check if this cg did not move to another node */
-            if (a[i] < moved)
+            if (a[i] < movedValue)
             {
-                if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
+                gmx_cgsort_t entry;
+                entry.nsc    = a[i];
+                entry.ind_gl = dd->globalAtomGroupIndices[i];
+                entry.ind    = i;
+
+                if (i >= ncg_home_old || a[i] != sort->sorted[i].nsc)
                 {
                     /* This cg is new on this node or moved ns grid cell */
-                    if (nsort_new >= sort->sort_new_nalloc)
-                    {
-                        sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
-                        srenew(sort->sort_new, sort->sort_new_nalloc);
-                    }
-                    sort_i = &(sort->sort_new[nsort_new++]);
+                    moved.push_back(entry);
                 }
                 else
                 {
                     /* This cg did not move */
-                    sort_i = &(sort->sort2[nsort2++]);
+                    stationary.push_back(entry);
                 }
-                /* Sort on the ns grid cell indices
-                 * and the global topology index.
-                 * index_gl is irrelevant with cell ns,
-                 * but we set it here anyhow to avoid a conditional.
-                 */
-                sort_i->nsc    = a[i];
-                sort_i->ind_gl = dd->index_gl[i];
-                sort_i->ind    = i;
-                ncg_new++;
             }
         }
+
         if (debug)
         {
-            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
-                    nsort2, nsort_new);
+            fprintf(debug, "ordered sort cgs: stationary %zu moved %zu\n",
+                    stationary.size(), moved.size());
         }
         /* Sort efficiently */
-        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
-                     sort->sort);
+        orderedSort(stationary, moved, &sort->sorted);
     }
     else
     {
-        cgsort  = sort->sort;
-        ncg_new = 0;
-        for (i = 0; i < dd->ncg_home; i++)
+        std::vector<gmx_cgsort_t> &cgsort   = sort->sorted;
+        cgsort.clear();
+        cgsort.reserve(dd->ncg_home);
+        int                        numCGNew = 0;
+        for (int i = 0; i < dd->ncg_home; i++)
         {
             /* Sort on the ns grid cell indices
              * and the global topology index
              */
-            cgsort[i].nsc    = a[i];
-            cgsort[i].ind_gl = dd->index_gl[i];
-            cgsort[i].ind    = i;
-            if (cgsort[i].nsc < moved)
+            gmx_cgsort_t entry;
+            entry.nsc    = a[i];
+            entry.ind_gl = dd->globalAtomGroupIndices[i];
+            entry.ind    = i;
+            cgsort.push_back(entry);
+            if (cgsort[i].nsc < movedValue)
             {
-                ncg_new++;
+                numCGNew++;
             }
         }
         if (debug)
         {
-            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
+            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, numCGNew);
         }
         /* Determine the order of the charge groups using qsort */
-        gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
-    }
+        std::sort(cgsort.begin(), cgsort.end(), comp_cgsort);
 
-    return ncg_new;
+        /* Remove the charge groups which are no longer at home here */
+        cgsort.resize(numCGNew);
+    }
 }
 
-static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
+/* Sets the sorting order for atoms based on the nbnxn grid order in \p sort */
+static void dd_sort_order_nbnxn(const t_forcerec          *fr,
+                                std::vector<gmx_cgsort_t> *sort)
 {
-    gmx_cgsort_t *sort;
-    int           ncg_new, i, na;
-    const int    *a;
-
-    sort = dd->comm->sort->sort;
+    gmx::ArrayRef<const int> atomOrder = nbnxn_get_atomorder(fr->nbv->nbs.get());
 
-    nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
-
-    ncg_new = 0;
-    for (i = 0; i < na; i++)
+    /* Using push_back() instead of this resize results in much slower code */
+    sort->resize(atomOrder.size());
+    gmx::ArrayRef<gmx_cgsort_t> buffer    = *sort;
+    size_t                      numSorted = 0;
+    for (int i : atomOrder)
     {
-        if (a[i] >= 0)
+        if (i >= 0)
         {
-            sort[ncg_new].ind = a[i];
-            ncg_new++;
+            /* The values of nsc and ind_gl are not used in this case */
+            buffer[numSorted++].ind = i;
         }
     }
-
-    return ncg_new;
+    sort->resize(numSorted);
 }
 
 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
                           int ncg_home_old)
 {
-    gmx_domdec_sort_t *sort;
-    gmx_cgsort_t      *cgsort;
-    int               *cgindex;
-    int                ncg_new, i, *ibuf, cgsize;
-    rvec              *vbuf;
-
-    sort = dd->comm->sort;
-
-    if (dd->ncg_home > sort->sort_nalloc)
-    {
-        sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
-        srenew(sort->sort, sort->sort_nalloc);
-        srenew(sort->sort2, sort->sort_nalloc);
-    }
-    cgsort = sort->sort;
+    gmx_domdec_sort_t *sort = dd->comm->sort.get();
 
     switch (fr->cutoff_scheme)
     {
         case ecutsGROUP:
-            ncg_new = dd_sort_order(dd, fr, ncg_home_old);
+            dd_sort_order(dd, fr, ncg_home_old, sort);
             break;
         case ecutsVERLET:
-            ncg_new = dd_sort_order_nbnxn(dd, fr);
+            dd_sort_order_nbnxn(fr, &sort->sorted);
             break;
         default:
             gmx_incons("unimplemented");
-            ncg_new = 0;
     }
 
+    const gmx::RangePartitioning &atomGrouping = dd->atomGrouping();
+
     /* We allocate with the old size, since the atom grouping is still the old one */
-    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
-    vbuf = dd->comm->vbuf.v;
+    GMX_ASSERT(atomGrouping.numBlocks() == dd->ncg_home, "atomGroups and dd should be consistent");
+    DDBufferAccess<gmx::RVec>     rvecBuffer(dd->comm->rvecBuffer, atomGrouping.fullRange().end());
 
-    if (dd->comm->bCGs)
-    {
-        cgindex = dd->cgindex;
-    }
-    else
-    {
-        cgindex = nullptr;
-    }
+    const gmx::RangePartitioning *atomGroupsPtr = (dd->comm->bCGs ? &atomGrouping : nullptr);
 
-    /* Remove the charge groups which are no longer at home here */
-    dd->ncg_home = ncg_new;
+    /* Set the new home atom/charge group count */
+    dd->ncg_home = sort->sorted.size();
     if (debug)
     {
         fprintf(debug, "Set the new home charge group count to %d\n",
@@ -9041,101 +6079,91 @@ static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state
     }
 
     /* Reorder the state */
+    gmx::ArrayRef<const gmx_cgsort_t> cgsort = sort->sorted;
+    GMX_RELEASE_ASSERT(cgsort.size() == dd->ncg_home, "We should sort all the home atom groups");
+
     if (state->flags & (1 << estX))
     {
-        order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->x.data()), vbuf);
+        order_vec_atom(atomGroupsPtr, cgsort, state->x, rvecBuffer.buffer);
     }
     if (state->flags & (1 << estV))
     {
-        order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->v.data()), vbuf);
+        order_vec_atom(atomGroupsPtr, cgsort, state->v, rvecBuffer.buffer);
     }
     if (state->flags & (1 << estCGP))
     {
-        order_vec_atom(dd->ncg_home, cgindex, cgsort, as_rvec_array(state->cg_p.data()), vbuf);
+        order_vec_atom(atomGroupsPtr, cgsort, state->cg_p, rvecBuffer.buffer);
     }
 
     if (fr->cutoff_scheme == ecutsGROUP)
     {
         /* Reorder cgcm */
-        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
+        gmx::ArrayRef<gmx::RVec> cgcmRef = gmx::arrayRefFromArray(reinterpret_cast<gmx::RVec *>(cgcm[0]), cgsort.size());
+        orderVector(cgsort, cgcmRef, rvecBuffer.buffer);
     }
 
-    if (dd->ncg_home+1 > sort->ibuf_nalloc)
-    {
-        sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
-        srenew(sort->ibuf, sort->ibuf_nalloc);
-    }
-    ibuf = sort->ibuf;
     /* Reorder the global cg index */
-    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
+    orderVector<int>(cgsort, dd->globalAtomGroupIndices, &sort->intBuffer);
     /* Reorder the cginfo */
-    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
+    orderVector<int>(cgsort, gmx::arrayRefFromArray(fr->cginfo, cgsort.size()), &sort->intBuffer);
     /* Rebuild the local cg index */
     if (dd->comm->bCGs)
     {
-        ibuf[0] = 0;
-        for (i = 0; i < dd->ncg_home; i++)
-        {
-            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
-            ibuf[i+1] = ibuf[i] + cgsize;
-        }
-        for (i = 0; i < dd->ncg_home+1; i++)
+        /* We make a new, ordered atomGroups object and assign it to
+         * the old one. This causes some allocation overhead, but saves
+         * a copy back of the whole index.
+         */
+        gmx::RangePartitioning ordered;
+        for (const gmx_cgsort_t &entry : cgsort)
         {
-            dd->cgindex[i] = ibuf[i];
+            ordered.appendBlock(atomGrouping.block(entry.ind).size());
         }
+        dd->atomGrouping_ = ordered;
     }
     else
     {
-        for (i = 0; i < dd->ncg_home+1; i++)
-        {
-            dd->cgindex[i] = i;
-        }
+        dd->atomGrouping_.setAllBlocksSizeOne(dd->ncg_home);
     }
     /* Set the home atom number */
-    dd->nat_home = dd->cgindex[dd->ncg_home];
+    dd->comm->atomRanges.setEnd(DDAtomRanges::Type::Home, dd->atomGrouping().fullRange().end());
 
     if (fr->cutoff_scheme == ecutsVERLET)
     {
         /* The atoms are now exactly in grid order, update the grid order */
-        nbnxn_set_atomorder(fr->nbv->nbs);
+        nbnxn_set_atomorder(fr->nbv->nbs.get());
     }
     else
     {
         /* Copy the sorted ns cell indices back to the ns grid struct */
-        for (i = 0; i < dd->ncg_home; i++)
+        for (gmx::index i = 0; i < cgsort.size(); i++)
         {
             fr->ns->grid->cell_index[i] = cgsort[i].nsc;
         }
-        fr->ns->grid->nr = dd->ncg_home;
+        fr->ns->grid->nr = cgsort.size();
     }
 }
 
 static void add_dd_statistics(gmx_domdec_t *dd)
 {
-    gmx_domdec_comm_t *comm;
-    int                ddnat;
-
-    comm = dd->comm;
+    gmx_domdec_comm_t *comm = dd->comm;
 
-    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
+    for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
     {
-        comm->sum_nat[ddnat-ddnatZONE] +=
-            comm->nat[ddnat] - comm->nat[ddnat-1];
+        auto range = static_cast<DDAtomRanges::Type>(i);
+        comm->sum_nat[i] +=
+            comm->atomRanges.end(range) - comm->atomRanges.start(range);
     }
     comm->ndecomp++;
 }
 
 void reset_dd_statistics_counters(gmx_domdec_t *dd)
 {
-    gmx_domdec_comm_t *comm;
-    int                ddnat;
-
-    comm = dd->comm;
+    gmx_domdec_comm_t *comm = dd->comm;
 
     /* Reset all the statistics and counters for total run counting */
-    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
+    for (int i = 0; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
     {
-        comm->sum_nat[ddnat-ddnatZONE] = 0;
+        comm->sum_nat[i] = 0;
     }
     comm->ndecomp   = 0;
     comm->nload     = 0;
@@ -9147,15 +6175,12 @@ void reset_dd_statistics_counters(gmx_domdec_t *dd)
     comm->load_pme = 0;
 }
 
-void print_dd_statistics(t_commrec *cr, const t_inputrec *ir, FILE *fplog)
+void print_dd_statistics(const t_commrec *cr, const t_inputrec *ir, FILE *fplog)
 {
-    gmx_domdec_comm_t *comm;
-    int                ddnat;
-    double             av;
-
-    comm = cr->dd->comm;
+    gmx_domdec_comm_t *comm      = cr->dd->comm;
 
-    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
+    const int          numRanges = static_cast<int>(DDAtomRanges::Type::Number);
+    gmx_sumd(numRanges, comm->sum_nat, cr);
 
     if (fplog == nullptr)
     {
@@ -9164,17 +6189,18 @@ void print_dd_statistics(t_commrec *cr, const t_inputrec *ir, FILE *fplog)
 
     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
 
-    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
+    for (int i = static_cast<int>(DDAtomRanges::Type::Zones); i < numRanges; i++)
     {
-        av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
-        switch (ddnat)
+        auto   range = static_cast<DDAtomRanges::Type>(i);
+        double av    = comm->sum_nat[i]/comm->ndecomp;
+        switch (range)
         {
-            case ddnatZONE:
+            case DDAtomRanges::Type::Zones:
                 fprintf(fplog,
                         " av. #atoms communicated per step for force:  %d x %.1f\n",
                         2, av);
                 break;
-            case ddnatVSITE:
+            case DDAtomRanges::Type::Vsites:
                 if (cr->dd->vsite_comm)
                 {
                     fprintf(fplog,
@@ -9183,7 +6209,7 @@ void print_dd_statistics(t_commrec *cr, const t_inputrec *ir, FILE *fplog)
                             av);
                 }
                 break;
-            case ddnatCON:
+            case DDAtomRanges::Type::Constraints:
                 if (cr->dd->constraint_comm)
                 {
                     fprintf(fplog,
@@ -9204,8 +6230,8 @@ void print_dd_statistics(t_commrec *cr, const t_inputrec *ir, FILE *fplog)
 }
 
 void dd_partition_system(FILE                *fplog,
-                         gmx_int64_t          step,
-                         t_commrec           *cr,
+                         int64_t              step,
+                         const t_commrec     *cr,
                          gmx_bool             bMasterState,
                          int                  nstglobalcomm,
                          t_state             *state_global,
@@ -9217,20 +6243,20 @@ void dd_partition_system(FILE                *fplog,
                          gmx_localtop_t      *top_local,
                          t_forcerec          *fr,
                          gmx_vsite_t         *vsite,
-                         gmx_constr_t         constr,
+                         gmx::Constraints    *constr,
                          t_nrnb              *nrnb,
-                         gmx_wallcycle_t      wcycle,
+                         gmx_wallcycle       *wcycle,
                          gmx_bool             bVerbose)
 {
     gmx_domdec_t      *dd;
     gmx_domdec_comm_t *comm;
     gmx_ddbox_t        ddbox = {0};
     t_block           *cgs_gl;
-    gmx_int64_t        step_pcoupl;
+    int64_t            step_pcoupl;
     rvec               cell_ns_x0, cell_ns_x1;
-    int                i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
+    int                ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckWhetherToTurnDlbOn, bLogLoad;
-    gmx_bool           bRedist, bSortCG, bResortAll;
+    gmx_bool           bRedist, bResortAll;
     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
     real               grid_density;
     char               sbuf[22];
@@ -9240,6 +6266,8 @@ void dd_partition_system(FILE                *fplog,
     dd   = cr->dd;
     comm = dd->comm;
 
+    // TODO if the update code becomes accessible here, use
+    // upd->deform for this logic.
     bBoxChanged = (bMasterState || inputrecDeform(ir));
     if (ir->epc != epcNO)
     {
@@ -9252,7 +6280,7 @@ void dd_partition_system(FILE                *fplog,
          * We need to determine the last step in which p-coupling occurred.
          * MRS -- need to validate this for vv?
          */
-        n = ir->nstpcouple;
+        int n = ir->nstpcouple;
         if (n == 1)
         {
             step_pcoupl = step - 1;
@@ -9438,22 +6466,17 @@ void dd_partition_system(FILE                *fplog,
     if (bMasterState)
     {
         /* Clear the old state */
-        clear_dd_indices(dd, 0, 0);
+        clearDDStateIndices(dd, 0, 0);
         ncgindex_set = 0;
 
-        rvec *xGlobal = (SIMMASTER(cr) ? as_rvec_array(state_global->x.data()) : nullptr);
+        auto xGlobal = positionsFromStatePointer(state_global);
 
-        set_ddbox(dd, bMasterState, cr, ir,
-                  SIMMASTER(cr) ? state_global->box : nullptr,
-                  TRUE, cgs_gl, xGlobal,
+        set_ddbox(dd, true, ir,
+                  DDMASTER(dd) ? state_global->box : nullptr,
+                  true, xGlobal,
                   &ddbox);
 
-        get_cg_distribution(fplog, dd, cgs_gl,
-                            SIMMASTER(cr) ? state_global->box : nullptr,
-                            &ddbox, xGlobal);
-
-        dd_distribute_state(dd, cgs_gl,
-                            state_global, state_local, f);
+        distributeState(fplog, dd, state_global, ddbox, state_local, f);
 
         dd_make_local_cgs(dd, &top_local->cgs);
 
@@ -9466,15 +6489,15 @@ void dd_partition_system(FILE                *fplog,
                       &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
         }
 
-        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
+        inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
 
-        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
+        dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
     }
     else if (state_local->ddp_count != dd->ddp_count)
     {
         if (state_local->ddp_count > dd->ddp_count)
         {
-            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
+            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%ld)", state_local->ddp_count, dd->ddp_count);
         }
 
         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
@@ -9483,10 +6506,10 @@ void dd_partition_system(FILE                *fplog,
         }
 
         /* Clear the old state */
-        clear_dd_indices(dd, 0, 0);
+        clearDDStateIndices(dd, 0, 0);
 
-        /* Build the new indices */
-        rebuild_cgindex(dd, cgs_gl->index, state_local);
+        /* Restore the atom group indices from state_local */
+        restoreAtomGroups(dd, cgs_gl->index, state_local);
         make_dd_indices(dd, cgs_gl->index, 0);
         ncgindex_set = dd->ncg_home;
 
@@ -9497,12 +6520,12 @@ void dd_partition_system(FILE                *fplog,
                       &top_local->cgs, as_rvec_array(state_local->x.data()), fr->cg_cm);
         }
 
-        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
+        inc_nrnb(nrnb, eNR_CGCM, comm->atomRanges.numHomeAtoms());
 
-        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
+        dd_set_cginfo(dd->globalAtomGroupIndices, 0, dd->ncg_home, fr, comm->bLocalCG);
 
-        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
-                  TRUE, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
+        set_ddbox(dd, bMasterState, ir, state_local->box,
+                  true, state_local->x, &ddbox);
 
         bRedist = isDlbOn(comm);
     }
@@ -9511,7 +6534,7 @@ void dd_partition_system(FILE                *fplog,
         /* We have the full state, only redistribute the cgs */
 
         /* Clear the non-home indices */
-        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
+        clearDDStateIndices(dd, dd->ncg_home, comm->atomRanges.numHomeAtoms());
         ncgindex_set = 0;
 
         /* Avoid global communication for dim's without pbc and -gcom */
@@ -9520,8 +6543,8 @@ void dd_partition_system(FILE                *fplog,
             copy_rvec(comm->box0, ddbox.box0    );
             copy_rvec(comm->box_size, ddbox.box_size);
         }
-        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
-                  bNStGlobalComm, &top_local->cgs, as_rvec_array(state_local->x.data()), &ddbox);
+        set_ddbox(dd, bMasterState, ir, state_local->box,
+                  bNStGlobalComm, state_local->x, &ddbox);
 
         bBoxChanged = TRUE;
         bRedist     = TRUE;
@@ -9539,7 +6562,7 @@ void dd_partition_system(FILE                *fplog,
     }
 
     /* Check if we should sort the charge groups */
-    bSortCG = (bMasterState || bRedist);
+    const bool bSortCG = (bMasterState || bRedist);
 
     ncg_home_old = dd->ncg_home;
 
@@ -9553,9 +6576,12 @@ void dd_partition_system(FILE                *fplog,
     {
         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
 
+        ncgindex_set = dd->ncg_home;
         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
                            state_local, f, fr,
-                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
+                           nrnb, &ncg_moved);
+
+        GMX_RELEASE_ASSERT(bSortCG, "Sorting is required after redistribution");
 
         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
     }
@@ -9580,7 +6606,7 @@ void dd_partition_system(FILE                *fplog,
                        fr->rlist, grid_density);
             break;
         case ecutsVERLET:
-            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
+            nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_old[XX], &ncells_old[YY]);
             break;
         default:
             gmx_incons("unimplemented");
@@ -9608,7 +6634,7 @@ void dd_partition_system(FILE                *fplog,
             case ecutsVERLET:
                 set_zones_size(dd, state_local->box, &ddbox, 0, 1, ncg_moved);
 
-                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
+                nbnxn_put_on_grid(fr->nbv->nbs.get(), fr->ePBC, state_local->box,
                                   0,
                                   comm->zones.size[0].bb_x0,
                                   comm->zones.size[0].bb_x1,
@@ -9616,11 +6642,11 @@ void dd_partition_system(FILE                *fplog,
                                   comm->zones.dens_zone0,
                                   fr->cginfo,
                                   as_rvec_array(state_local->x.data()),
-                                  ncg_moved, bRedist ? comm->moved : nullptr,
+                                  ncg_moved, bRedist ? comm->movedBuffer.data() : nullptr,
                                   fr->nbv->grp[eintLocal].kernel_type,
                                   fr->nbv->nbat);
 
-                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
+                nbnxn_get_ncells(fr->nbv->nbs.get(), &ncells_new[XX], &ncells_new[YY]);
                 break;
             case ecutsGROUP:
                 fill_grid(&comm->zones, fr->ns->grid, dd->ncg_home,
@@ -9653,10 +6679,10 @@ void dd_partition_system(FILE                *fplog,
                       bResortAll ? -1 : ncg_home_old);
 
         /* After sorting and compacting we set the correct size */
-        dd_resize_state(state_local, f, dd->nat_home);
+        dd_resize_state(state_local, f, comm->atomRanges.numHomeAtoms());
 
         /* Rebuild all the indices */
-        ga2la_clear(dd->ga2la);
+        dd->ga2la->clear();
         ncgindex_set = 0;
 
         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
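Note on the hunk above: after sorting, the global-to-local atom lookup is cleared via the new class-style call dd->ga2la->clear() (replacing ga2la_clear()) before the local indices are rebuilt. As a rough, standalone illustration of what such a global-to-local table does — names here are hypothetical and the real ga2la class in GROMACS is more elaborate (it also tracks the zone an atom lives in):

// Rough standalone model of a global-to-local atom index table; illustrative only.
#include <cstdio>
#include <unordered_map>

class GlobalToLocalModel
{
    public:
        void clear() { table_.clear(); }

        void insert(int globalIndex, int localIndex) { table_[globalIndex] = localIndex; }

        // Returns the local index, or -1 when the atom is not present on this rank.
        int find(int globalIndex) const
        {
            auto it = table_.find(globalIndex);
            return (it != table_.end()) ? it->second : -1;
        }

    private:
        std::unordered_map<int, int> table_;
};

int main()
{
    GlobalToLocalModel ga2la;
    ga2la.insert(1042, 7);                      // global atom 1042 is local atom 7
    std::printf("%d %d\n", ga2la.find(1042), ga2la.find(9999));

    ga2la.clear();                              // as done above after sorting/repartitioning
    std::printf("%d\n", ga2la.find(1042));      // now absent again
    return 0;
}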
@@ -9675,6 +6701,7 @@ void dd_partition_system(FILE                *fplog,
 
     if (fr->cutoff_scheme == ecutsVERLET)
     {
+        /* When bSortCG=true, we have already set the size for zone 0 */
         set_zones_size(dd, state_local->box, &ddbox,
                        bSortCG ? 1 : 0, comm->zones.n,
                        0);
@@ -9690,9 +6717,9 @@ void dd_partition_system(FILE                *fplog,
     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
 
     /* Extract a local topology from the global topology */
-    for (i = 0; i < dd->ndim; i++)
+    for (int i = 0; i < dd->ndim; i++)
     {
-        np[dd->dim[i]] = comm->cd[i].np;
+        np[dd->dim[i]] = comm->cd[i].numPulses();
     }
     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
                       comm->cellsize_min, np,
@@ -9705,18 +6732,19 @@ void dd_partition_system(FILE                *fplog,
     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
 
     /* Set up the special atom communication */
-    n = comm->nat[ddnatZONE];
-    for (i = ddnatZONE+1; i < ddnatNR; i++)
+    int n = comm->atomRanges.end(DDAtomRanges::Type::Zones);
+    for (int i = static_cast<int>(DDAtomRanges::Type::Zones) + 1; i < static_cast<int>(DDAtomRanges::Type::Number); i++)
     {
-        switch (i)
+        auto range = static_cast<DDAtomRanges::Type>(i);
+        switch (range)
         {
-            case ddnatVSITE:
+            case DDAtomRanges::Type::Vsites:
                 if (vsite && vsite->n_intercg_vsite)
                 {
                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
                 }
                 break;
-            case ddnatCON:
+            case DDAtomRanges::Type::Constraints:
                 if (dd->bInterCGcons || dd->bInterCGsettles)
                 {
                     /* Only for inter-cg constraints we need special code */
@@ -9728,7 +6756,7 @@ void dd_partition_system(FILE                *fplog,
             default:
                 gmx_incons("Unknown special atom type setup");
         }
-        comm->nat[i] = n;
+        comm->atomRanges.setEnd(range, n);
     }
 
     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
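The loop above replaces the old comm->nat[] array with DDAtomRanges: each special range (zones, vsites, constraints) starts where the previous one ends, so a single running atom count n is recorded per type via setEnd(), and queries like numHomeAtoms() or numAtomsTotal() read those ends back. A minimal, self-contained sketch of that bookkeeping pattern follows; it is an illustrative model, not the actual GROMACS DDAtomRanges class, and the numbers are made up:

// Illustrative stand-in for the prefix-style range bookkeeping used above.
#include <array>
#include <cstdio>

class AtomRangesModel
{
    public:
        enum class Type : int { Home, Zones, Vsites, Constraints, Number };

        void setEnd(Type type, int end) { end_[static_cast<int>(type)] = end; }
        int  end(Type type) const       { return end_[static_cast<int>(type)]; }

        int numHomeAtoms() const  { return end(Type::Home); }
        int numAtomsTotal() const { return end_[static_cast<int>(Type::Number) - 1]; }

    private:
        std::array<int, static_cast<int>(Type::Number)> end_ = {};
};

int main()
{
    AtomRangesModel ranges;
    ranges.setEnd(AtomRangesModel::Type::Home, 1000);   // home atoms on this rank
    ranges.setEnd(AtomRangesModel::Type::Zones, 1500);  // plus communicated zone atoms

    // Mimic the loop above: each special range begins where the last one ended.
    int n = ranges.end(AtomRangesModel::Type::Zones);
    n += 20;                                             // communicated vsite constructing atoms
    ranges.setEnd(AtomRangesModel::Type::Vsites, n);
    n += 50;                                             // communicated constraint atoms
    ranges.setEnd(AtomRangesModel::Type::Constraints, n);

    std::printf("home=%d total=%d\n", ranges.numHomeAtoms(), ranges.numAtomsTotal());
    return 0;
}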
@@ -9738,7 +6766,7 @@ void dd_partition_system(FILE                *fplog,
     /* Make space for the extra coordinates for virtual site
      * or constraint communication.
      */
-    state_local->natoms = comm->nat[ddnatNR-1];
+    state_local->natoms = comm->atomRanges.numAtomsTotal();
 
     dd_resize_state(state_local, f, state_local->natoms);
 
@@ -9746,17 +6774,17 @@ void dd_partition_system(FILE                *fplog,
     {
         if (vsite && vsite->n_intercg_vsite)
         {
-            nat_f_novirsum = comm->nat[ddnatVSITE];
+            nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Vsites);
         }
         else
         {
             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
             {
-                nat_f_novirsum = dd->nat_tot;
+                nat_f_novirsum = comm->atomRanges.end(DDAtomRanges::Type::Zones);
             }
             else
             {
-                nat_f_novirsum = dd->nat_home;
+                nat_f_novirsum = comm->atomRanges.numHomeAtoms();
             }
         }
     }
@@ -9771,17 +6799,14 @@ void dd_partition_system(FILE                *fplog,
      * allocation, zeroing and copying, but this is probably not worth
      * the complications and checking.
      */
-    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
-                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
+    forcerec_set_ranges(fr, dd->ncg_home, dd->globalAtomGroupIndices.size(),
+                        comm->atomRanges.end(DDAtomRanges::Type::Zones),
+                        comm->atomRanges.end(DDAtomRanges::Type::Constraints),
+                        nat_f_novirsum);
 
     /* Update atom data for mdatoms and several algorithms */
     mdAlgorithmsSetupAtomData(cr, ir, top_global, top_local, fr,
-                              nullptr, mdAtoms, vsite, nullptr);
-
-    if (ir->implicit_solvent)
-    {
-        make_local_gb(cr, fr->born, ir->gb_algorithm);
-    }
+                              nullptr, mdAtoms, constr, vsite, nullptr);
 
     auto mdatoms = mdAtoms->mdatoms();
     if (!thisRankHasDuty(cr, DUTY_PME))
@@ -9789,34 +6814,23 @@ void dd_partition_system(FILE                *fplog,
         /* Send the charges and/or c6/sigmas to our PME only node */
         gmx_pme_send_parameters(cr,
                                 fr->ic,
-                                mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
+                                mdatoms->nChargePerturbed != 0, mdatoms->nTypePerturbed != 0,
                                 mdatoms->chargeA, mdatoms->chargeB,
                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
                                 mdatoms->sigmaA, mdatoms->sigmaB,
                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
     }
 
-    if (constr)
-    {
-        set_constraints(constr, top_local, ir, mdatoms, cr);
-    }
-
     if (ir->bPull)
     {
         /* Update the local pull groups */
-        dd_make_local_pull_groups(cr, ir->pull_work, mdatoms);
-    }
-
-    if (ir->bRot)
-    {
-        /* Update the local rotation groups */
-        dd_make_local_rotation_groups(dd, ir->rot);
+        dd_make_local_pull_groups(cr, ir->pull_work);
     }
 
-    if (ir->eSwapCoords != eswapNO)
+    if (dd->atomSets != nullptr)
     {
-        /* Update the local groups needed for ion swapping */
-        dd_make_local_swap_groups(dd, ir->swap);
+        /* Update the local atom sets */
+        dd->atomSets->setIndicesInDomainDecomposition(*(dd->ga2la));
     }
 
     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
@@ -9837,7 +6851,7 @@ void dd_partition_system(FILE                *fplog,
 
     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
     {
-        dd_move_x(dd, state_local->box, as_rvec_array(state_local->x.data()));
+        dd_move_x(dd, state_local->box, state_local->x, nullWallcycle);
         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                      -1, as_rvec_array(state_local->x.data()), state_local->box);
     }
@@ -9866,3 +6880,22 @@ void dd_partition_system(FILE                *fplog,
 
     wallcycle_stop(wcycle, ewcDOMDEC);
 }
+
+/*! \brief Check whether bonded interactions are missing, if appropriate */
+void checkNumberOfBondedInteractions(FILE                 *fplog,
+                                     t_commrec            *cr,
+                                     int                   totalNumberOfBondedInteractions,
+                                     const gmx_mtop_t     *top_global,
+                                     const gmx_localtop_t *top_local,
+                                     const t_state        *state,
+                                     bool                 *shouldCheckNumberOfBondedInteractions)
+{
+    if (*shouldCheckNumberOfBondedInteractions)
+    {
+        if (totalNumberOfBondedInteractions != cr->dd->nbonded_global)
+        {
+            dd_print_missing_interactions(fplog, cr, totalNumberOfBondedInteractions, top_global, top_local, state); // Does not return
+        }
+        *shouldCheckNumberOfBondedInteractions = false;
+    }
+}
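The helper added above implements a deferred check: the caller sets the flag after a repartitioning, later supplies the globally reduced interaction count, and the check fires once (dd_print_missing_interactions does not return on mismatch) before clearing the flag so subsequent steps skip it. A minimal, self-contained sketch of the same flag-and-compare pattern follows; all names in it are hypothetical and it exits instead of calling a fatal-error routine:

// Standalone sketch of the deferred check pattern used by
// checkNumberOfBondedInteractions(); names below are hypothetical.
#include <cstdio>
#include <cstdlib>

struct DecompositionModel
{
    int numBondedGlobal;   // analogous to cr->dd->nbonded_global
};

// Verify the reduced count only when a repartitioning requested it,
// then clear the request so later steps skip the check.
void checkBondedCount(const DecompositionModel &dd,
                      int                       totalObserved,
                      bool                     *shouldCheck)
{
    if (*shouldCheck)
    {
        if (totalObserved != dd.numBondedGlobal)
        {
            std::fprintf(stderr, "missing interactions: have %d, expected %d\n",
                         totalObserved, dd.numBondedGlobal);
            std::exit(1);   // the real helper reports details and terminates via GROMACS error handling
        }
        *shouldCheck = false;
    }
}

int main()
{
    DecompositionModel dd            = { 120 };
    bool               shouldCheck   = true;   // set after each repartitioning
    int                totalObserved = 120;    // would come from a global reduction

    checkBondedCount(dd, totalObserved, &shouldCheck);   // checks, then clears the flag
    checkBondedCount(dd, totalObserved, &shouldCheck);   // no-op until the next repartition
    return 0;
}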