Domain decomposition now checks the rlist buffer
[alexxy/gromacs.git] / src / gromacs / mdlib / domdec.c
index 80c4b5c14c29e24244462c4da999dc461e2585e6..ca2514a5dd4df5bd133c842e9890aa7c54a4f297 100644 (file)
@@ -1,19 +1,36 @@
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
  *
- * 
- * This file is part of Gromacs        Copyright (c) 1991-2008
- * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
+ * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
  *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
  * of the License, or (at your option) any later version.
  *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
  * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org
- * 
- * And Hey:
- * Gnomes, ROck Monsters And Chili Sauce
+ * the research papers on the package. Check out http://www.gromacs.org.
  */
 
 #ifdef HAVE_CONFIG_H
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
+#include <assert.h>
+
 #include "typedefs.h"
-#include "smalloc.h"
+#include "gromacs/utility/smalloc.h"
 #include "gmx_fatal.h"
 #include "gmx_fatal_collective.h"
 #include "vec.h"
 #include "constr.h"
 #include "mdatoms.h"
 #include "names.h"
-#include "pdbio.h"
-#include "futil.h"
 #include "force.h"
 #include "pme.h"
-#include "pull.h"
-#include "pull_rotation.h"
-#include "gmx_wallcycle.h"
 #include "mdrun.h"
 #include "nsgrid.h"
 #include "shellfc.h"
 #include "mtop_util.h"
-#include "gmxfio.h"
 #include "gmx_ga2la.h"
-#include "gmx_sort.h"
 #include "macros.h"
 #include "nbnxn_search.h"
 #include "bondf.h"
 #include "gmx_omp_nthreads.h"
-
-#ifdef GMX_LIB_MPI
-#include <mpi.h>
-#endif
-#ifdef GMX_THREAD_MPI
-#include "tmpi.h"
-#endif
-
-#define DDRANK(dd,rank)    (rank)
+#include "gpu_utils.h"
+
+#include "gromacs/fileio/futil.h"
+#include "gromacs/fileio/gmxfio.h"
+#include "gromacs/fileio/pdbio.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/swap/swapcoords.h"
+#include "gromacs/utility/qsort_threadsafe.h"
+#include "gromacs/pulling/pull.h"
+#include "gromacs/pulling/pull_rotation.h"
+#include "gromacs/imd/imd.h"
+
+#define DDRANK(dd, rank)    (rank)
 #define DDMASTERRANK(dd)   (dd->masterrank)
 
 typedef struct gmx_domdec_master
@@ -72,12 +89,12 @@ typedef struct gmx_domdec_master
     /* The cell boundaries */
     real **cell_x;
     /* The global charge group division */
-    int  *ncg;     /* Number of home charge groups for each node */
-    int  *index;   /* Index of nnodes+1 into cg */
-    int  *cg;      /* Global charge group index */
-    int  *nat;     /* Number of home atoms for each node. */
-    int  *ibuf;    /* Buffer for communication */
-    rvec *vbuf;    /* Buffer for state scattering and gathering */
+    int   *ncg;    /* Number of home charge groups for each node */
+    int   *index;  /* Index of nnodes+1 into cg */
+    int   *cg;     /* Global charge group index */
+    int   *nat;    /* Number of home atoms for each node. */
+    int   *ibuf;   /* Buffer for communication */
+    rvec  *vbuf;   /* Buffer for state scattering and gathering */
 } gmx_domdec_master_t;
 
 typedef struct
@@ -86,36 +103,36 @@ typedef struct
      * that requires communication, the last entry contains the total
      * number of atoms that needs to be communicated.
      */
-    int nsend[DD_MAXIZONE+2];
-    int nrecv[DD_MAXIZONE+2];
+    int  nsend[DD_MAXIZONE+2];
+    int  nrecv[DD_MAXIZONE+2];
     /* The charge groups to send */
     int *index;
-    int nalloc;
+    int  nalloc;
     /* The atom range for non-in-place communication */
-    int cell2at0[DD_MAXIZONE];
-    int cell2at1[DD_MAXIZONE];
+    int  cell2at0[DD_MAXIZONE];
+    int  cell2at1[DD_MAXIZONE];
 } gmx_domdec_ind_t;
 
 typedef struct
 {
-    int  np;                   /* Number of grid pulses in this dimension */
-    int  np_dlb;               /* For dlb, for use with edlbAUTO          */
-    gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
-    int  np_nalloc;
-    gmx_bool bInPlace;             /* Can we communicate in place?            */
+    int               np;       /* Number of grid pulses in this dimension */
+    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
+    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
+    int               np_nalloc;
+    gmx_bool          bInPlace; /* Can we communicate in place?            */
 } gmx_domdec_comm_dim_t;
 
 typedef struct
 {
     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
-    real *cell_f;      /* State var.: cell boundaries, box relative      */
-    real *old_cell_f;  /* Temp. var.: old cell size                      */
-    real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
-    real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
-    real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
-    real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
-    gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
-    real *buf_ncd;     /* Temp. var.                                     */
+    real     *cell_f;      /* State var.: cell boundaries, box relative      */
+    real     *old_cell_f;  /* Temp. var.: old cell size                      */
+    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
+    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
+    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
+    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
+    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
+    real     *buf_ncd;     /* Temp. var.                                     */
 } gmx_domdec_root_t;
 
 #define DD_NLOAD_MAX 9
@@ -125,15 +142,15 @@ typedef struct
  */
 typedef struct
 {
-    int  nload;
+    int    nload;
     float *load;
-    float sum;
-    float max;
-    float sum_m;
-    float cvol_min;
-    float mdf;
-    float pme;
-    int   flags;
+    float  sum;
+    float  max;
+    float  sum_m;
+    float  cvol_min;
+    float  mdf;
+    float  pme;
+    int    flags;
 } gmx_domdec_load_t;
 
 typedef struct
@@ -147,37 +164,41 @@ typedef struct
 {
     gmx_cgsort_t *sort;
     gmx_cgsort_t *sort2;
-    int  sort_nalloc;
+    int           sort_nalloc;
     gmx_cgsort_t *sort_new;
-    int  sort_new_nalloc;
-    int  *ibuf;
-    int  ibuf_nalloc;
+    int           sort_new_nalloc;
+    int          *ibuf;
+    int           ibuf_nalloc;
 } gmx_domdec_sort_t;
 
 typedef struct
 {
     rvec *v;
-    int  nalloc;
+    int   nalloc;
 } vec_rvec_t;
 
 /* This enum determines the order of the coordinates.
  * ddnatHOME and ddnatZONE should be first and second,
  * the others can be ordered as wanted.
  */
-enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
+enum {
+    ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
+};
 
-enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
+enum {
+    edlbAUTO, edlbNO, edlbYES, edlbNR
+};
 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 
 typedef struct
 {
-    int  dim;      /* The dimension                                          */
-    gmx_bool dim_match;/* Tells if DD and PME dims match                         */
-    int  nslab;    /* The number of PME slabs in this dimension              */
-    real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
-    int  *pp_min;  /* The minimum pp node location, size nslab               */
-    int  *pp_max;  /* The maximum pp node location,size nslab                */
-    int  maxshift; /* The maximum shift for coordinate redistribution in PME */
+    int      dim;       /* The dimension                                          */
+    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
+    int      nslab;     /* The number of PME slabs in this dimension              */
+    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
+    int     *pp_min;    /* The minimum pp node location, size nslab               */
+    int     *pp_max;    /* The maximum pp node location,size nslab                */
+    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 } gmx_ddpme_t;
 
 typedef struct
@@ -194,12 +215,12 @@ typedef struct
 typedef struct
 {
     gmx_domdec_ind_t ind;
-    int *ibuf;
-    int ibuf_nalloc;
-    vec_rvec_t vbuf;
-    int nsend;
-    int nat;
-    int nsend_zone;
+    int             *ibuf;
+    int              ibuf_nalloc;
+    vec_rvec_t       vbuf;
+    int              nsend;
+    int              nat;
+    int              nsend_zone;
 } dd_comm_setup_work_t;
 
 typedef struct gmx_domdec_comm
@@ -209,31 +230,31 @@ typedef struct gmx_domdec_comm
      */
 
     /* The number of decomposition dimensions for PME, 0: no PME */
-    int  npmedecompdim;
+    int         npmedecompdim;
     /* The number of nodes doing PME (PP/PME or only PME) */
-    int  npmenodes;
-    int  npmenodes_x;
-    int  npmenodes_y;
+    int         npmenodes;
+    int         npmenodes_x;
+    int         npmenodes_y;
     /* The communication setup including the PME only nodes */
-    gmx_bool bCartesianPP_PME;
-    ivec ntot;
-    int  cartpmedim;
-    int  *pmenodes;          /* size npmenodes                         */
-    int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
-                              * but with bCartesianPP_PME              */
+    gmx_bool    bCartesianPP_PME;
+    ivec        ntot;
+    int         cartpmedim;
+    int        *pmenodes;          /* size npmenodes                         */
+    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
+                                    * but with bCartesianPP_PME              */
     gmx_ddpme_t ddpme[2];
-    
+
     /* The DD particle-particle nodes only */
     gmx_bool bCartesianPP;
-    int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
-    
+    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
+
     /* The global charge groups */
     t_block cgs_gl;
 
     /* Should we sort the cgs */
-    int  nstSortCG;
+    int                nstSortCG;
     gmx_domdec_sort_t *sort;
-    
+
     /* Are there charge groups? */
     gmx_bool bCGs;
 
@@ -242,36 +263,43 @@ typedef struct gmx_domdec_comm
     gmx_bool bInterCGMultiBody;
 
     /* Data for the optional bonded interaction atom communication range */
-    gmx_bool bBondComm;
+    gmx_bool  bBondComm;
     t_blocka *cglink;
-    char *bLocalCG;
+    char     *bLocalCG;
 
     /* The DLB option */
-    int  eDLB;
+    int      eDLB;
     /* Are we actually using DLB? */
     gmx_bool bDynLoadBal;
 
     /* Cell sizes for static load balancing, first index cartesian */
     real **slb_frac;
-    
+
     /* The width of the communicated boundaries */
-    real cutoff_mbody;
-    real cutoff;
+    real     cutoff_mbody;
+    real     cutoff;
     /* The minimum cell size (including triclinic correction) */
-    rvec cellsize_min;
+    rvec     cellsize_min;
     /* For dlb, for use with edlbAUTO */
-    rvec cellsize_min_dlb;
+    rvec     cellsize_min_dlb;
     /* The lower limit for the DD cell size with DLB */
-    real cellsize_limit;
+    real     cellsize_limit;
     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
     gmx_bool bVacDLBNoLimit;
 
+    /* With PME load balancing we set limits on DLB */
+    gmx_bool bPMELoadBalDLBLimits;
+    /* DLB needs to take into account that we want to allow this maximum
+     * cut-off (for PME load balancing), this could limit cell boundaries.
+     */
+    real PMELoadBal_max_cutoff;
+
     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
     ivec tric_dir;
     /* box0 and box_size are required with dim's without pbc and -gcom */
     rvec box0;
     rvec box_size;
-    
+
     /* The cell boundaries */
     rvec cell_x0;
     rvec cell_x1;
@@ -282,84 +310,86 @@ typedef struct gmx_domdec_comm
 
     /* The communication setup and charge group boundaries for the zones */
     gmx_domdec_zones_t zones;
-    
+
     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
      * cell boundaries of neighboring cells for dynamic load balancing.
      */
     gmx_ddzone_t zone_d1[2];
     gmx_ddzone_t zone_d2[2][2];
-    
+
     /* The coordinate/force communication setup and indices */
     gmx_domdec_comm_dim_t cd[DIM];
     /* The maximum number of cells to communicate with in one dimension */
-    int  maxpulse;
-    
+    int                   maxpulse;
+
     /* Which cg distribution is stored on the master node */
     int master_cg_ddp_count;
-    
+
     /* The number of cg's received from the direct neighbors */
     int  zone_ncg1[DD_MAXZONE];
-    
+
     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
     int  nat[ddnatNR];
 
     /* Array for signalling if atoms have moved to another domain */
     int  *moved;
-    int  moved_nalloc;
-    
+    int   moved_nalloc;
+
     /* Communication buffer for general use */
     int  *buf_int;
-    int  nalloc_int;
+    int   nalloc_int;
 
     /* Communication buffer for general use */
     vec_rvec_t vbuf;
 
     /* Temporary storage for thread parallel communication setup */
-    int nth;
+    int                   nth;
     dd_comm_setup_work_t *dth;
 
     /* Communication buffers only used with multiple grid pulses */
-    int  *buf_int2;
-    int  nalloc_int2;
+    int       *buf_int2;
+    int        nalloc_int2;
     vec_rvec_t vbuf2;
-    
+
     /* Communication buffers for local redistribution */
     int  **cggl_flag;
-    int  cggl_flag_nalloc[DIM*2];
+    int    cggl_flag_nalloc[DIM*2];
     rvec **cgcm_state;
-    int  cgcm_state_nalloc[DIM*2];
-    
+    int    cgcm_state_nalloc[DIM*2];
+
     /* Cell sizes for dynamic load balancing */
     gmx_domdec_root_t **root;
-    real *cell_f_row;
-    real cell_f0[DIM];
-    real cell_f1[DIM];
-    real cell_f_max0[DIM];
-    real cell_f_min1[DIM];
-    
+    real               *cell_f_row;
+    real                cell_f0[DIM];
+    real                cell_f1[DIM];
+    real                cell_f_max0[DIM];
+    real                cell_f_min1[DIM];
+
     /* Stuff for load communication */
-    gmx_bool bRecordLoad;
+    gmx_bool           bRecordLoad;
     gmx_domdec_load_t *load;
+    int                nrank_gpu_shared;
 #ifdef GMX_MPI
-    MPI_Comm *mpi_comm_load;
+    MPI_Comm          *mpi_comm_load;
+    MPI_Comm           mpi_comm_gpu_shared;
 #endif
 
     /* Maximum DLB scaling per load balancing step in percent */
     int dlb_scale_lim;
 
     /* Cycle counters */
-    float cycl[ddCyclNr];
-    int   cycl_n[ddCyclNr];
-    float cycl_max[ddCyclNr];
+    float  cycl[ddCyclNr];
+    int    cycl_n[ddCyclNr];
+    float  cycl_max[ddCyclNr];
     /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
-    int eFlop;
+    int    eFlop;
     double flop;
     int    flop_n;
     /* How often did we have load measurements */
     int    n_load_have;
     /* How often have we collected the load measurements */
     int    n_load_collect;
-    
+
     /* Statistics */
     double sum_nat[ddnatNR-ddnatZONE];
     int    ndecomp;
@@ -372,7 +402,7 @@ typedef struct gmx_domdec_comm
     double load_pme;
 
     /* The last partition step */
-    gmx_large_int_t partition_step;
+    gmx_int64_t partition_step;
 
     /* Debugging */
     int  nstDDDump;
@@ -391,7 +421,7 @@ typedef struct gmx_domdec_comm
 /* Zone permutation required to obtain consecutive charge groups
  * for neighbor searching.
  */
-static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
+static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 
 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
  * components see only j zones with that component 0.
@@ -399,22 +429,22 @@ static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 
 /* The DD zone order */
 static const ivec dd_zo[DD_MAXZONE] =
-  {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
+{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 
 /* The 3D setup */
 #define dd_z3n  8
 #define dd_zp3n 4
-static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
+static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 
 /* The 2D setup */
 #define dd_z2n  4
 #define dd_zp2n 2
-static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
+static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 
 /* The 1D setup */
 #define dd_z1n  2
 #define dd_zp1n 1
-static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
+static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 
 /* Factors used to avoid problems due to rounding issues */
 #define DD_CELL_MARGIN       1.0001
@@ -422,10 +452,16 @@ static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 /* Factor to account for pressure scaling during nstlist steps */
 #define DD_PRES_SCALE_MARGIN 1.02
 
-/* Allowed performance loss before we DLB or warn */
-#define DD_PERF_LOSS 0.05
+/* Turn on DLB when the load imbalance causes this amount of total loss.
+ * There is a bit of overhead with DLB and it's difficult to achieve
+ * a load imbalance of less than 2% with DLB.
+ */
+#define DD_PERF_LOSS_DLB_ON  0.02
+
+/* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
+#define DD_PERF_LOSS_WARN    0.05
 
-#define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
+#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 
 /* Use separate MPI send and receive commands
  * when nnodes <= GMX_DD_NNODES_SENDRECV.
@@ -436,34 +472,34 @@ static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 
 
 /*
-#define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
-
-static void index2xyz(ivec nc,int ind,ivec xyz)
-{
-  xyz[XX] = ind % nc[XX];
-  xyz[YY] = (ind / nc[XX]) % nc[YY];
-  xyz[ZZ] = ind / (nc[YY]*nc[XX]);
-}
-*/
+   #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
+
+   static void index2xyz(ivec nc,int ind,ivec xyz)
+   {
+   xyz[XX] = ind % nc[XX];
+   xyz[YY] = (ind / nc[XX]) % nc[YY];
+   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
+   }
+ */
 
 /* This order is required to minimize the coordinate communication in PME
  * which uses decomposition in the x direction.
  */
-#define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
+#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 
-static void ddindex2xyz(ivec nc,int ind,ivec xyz)
+static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 {
     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
     xyz[ZZ] = ind % nc[ZZ];
 }
 
-static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
+static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 {
     int ddindex;
-    int ddnodeid=-1;
-    
-    ddindex = dd_index(dd->nc,c);
+    int ddnodeid = -1;
+
+    ddindex = dd_index(dd->nc, c);
     if (dd->comm->bCartesianPP_PME)
     {
         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
@@ -471,26 +507,26 @@ static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
     else if (dd->comm->bCartesianPP)
     {
 #ifdef GMX_MPI
-        MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
+        MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 #endif
     }
     else
     {
         ddnodeid = ddindex;
     }
-    
+
     return ddnodeid;
 }
 
-static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
+static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 {
     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 }
 
-int ddglatnr(gmx_domdec_t *dd,int i)
+int ddglatnr(gmx_domdec_t *dd, int i)
 {
     int atnr;
-    
+
     if (dd == NULL)
     {
         atnr = i + 1;
@@ -499,11 +535,11 @@ int ddglatnr(gmx_domdec_t *dd,int i)
     {
         if (i >= dd->comm->nat[ddnatNR-1])
         {
-            gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
+            gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
         }
         atnr = dd->gatindex[i] + 1;
     }
-    
+
     return atnr;
 }
 
@@ -518,35 +554,35 @@ static void vec_rvec_init(vec_rvec_t *v)
     v->v      = NULL;
 }
 
-static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
+static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 {
     if (n > v->nalloc)
     {
         v->nalloc = over_alloc_dd(n);
-        srenew(v->v,v->nalloc);
+        srenew(v->v, v->nalloc);
     }
 }
 
-void dd_store_state(gmx_domdec_t *dd,t_state *state)
+void dd_store_state(gmx_domdec_t *dd, t_state *state)
 {
     int i;
-    
+
     if (state->ddp_count != dd->ddp_count)
     {
         gmx_incons("The state does not the domain decomposition state");
     }
-    
+
     state->ncg_gl = dd->ncg_home;
     if (state->ncg_gl > state->cg_gl_nalloc)
     {
         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
-        srenew(state->cg_gl,state->cg_gl_nalloc);
+        srenew(state->cg_gl, state->cg_gl_nalloc);
     }
-    for(i=0; i<state->ncg_gl; i++)
+    for (i = 0; i < state->ncg_gl; i++)
     {
         state->cg_gl[i] = dd->index_gl[i];
     }
-    
+
     state->ddp_count_cg_gl = dd->ddp_count;
 }
 
@@ -555,11 +591,11 @@ gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
     return &dd->comm->zones;
 }
 
-void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
-                      int *jcg0,int *jcg1,ivec shift0,ivec shift1)
+void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
+                      int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 {
     gmx_domdec_zones_t *zones;
-    int izone,d,dim;
+    int                 izone, d, dim;
 
     zones = &dd->comm->zones;
 
@@ -568,7 +604,7 @@ void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
     {
         izone++;
     }
-    
+
     if (izone == 0)
     {
         *jcg0 = icg;
@@ -579,15 +615,15 @@ void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
     }
     else
     {
-        gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
-                  icg,izone,zones->nizone);
+        gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
+                  icg, izone, zones->nizone);
     }
-        
+
     *jcg1 = zones->izone[izone].jcg1;
-    
-    for(d=0; d<dd->ndim; d++)
+
+    for (d = 0; d < dd->ndim; d++)
     {
-        dim = dd->dim[d];
+        dim         = dd->dim[d];
         shift0[dim] = zones->izone[izone].shift0[dim];
         shift1[dim] = zones->izone[izone].shift1[dim];
         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
@@ -604,78 +640,78 @@ int dd_natoms_vsite(gmx_domdec_t *dd)
     return dd->comm->nat[ddnatVSITE];
 }
 
-void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
+void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 {
     *at_start = dd->comm->nat[ddnatCON-1];
     *at_end   = dd->comm->nat[ddnatCON];
 }
 
-void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
+void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 {
-    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
-    int  *index,*cgindex;
-    gmx_domdec_comm_t *comm;
+    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
+    int                   *index, *cgindex;
+    gmx_domdec_comm_t     *comm;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t *ind;
-    rvec shift={0,0,0},*buf,*rbuf;
-    gmx_bool bPBC,bScrew;
-    
+    gmx_domdec_ind_t      *ind;
+    rvec                   shift = {0, 0, 0}, *buf, *rbuf;
+    gmx_bool               bPBC, bScrew;
+
     comm = dd->comm;
-    
+
     cgindex = dd->cgindex;
-    
+
     buf = comm->vbuf.v;
 
-    nzone = 1;
+    nzone   = 1;
     nat_tot = dd->nat_home;
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
         bPBC   = (dd->ci[dd->dim[d]] == 0);
         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
         if (bPBC)
         {
-            copy_rvec(box[dd->dim[d]],shift);
+            copy_rvec(box[dd->dim[d]], shift);
         }
         cd = &comm->cd[d];
-        for(p=0; p<cd->np; p++)
+        for (p = 0; p < cd->np; p++)
         {
-            ind = &cd->ind[p];
+            ind   = &cd->ind[p];
             index = ind->index;
-            n = 0;
+            n     = 0;
             if (!bPBC)
             {
-                for(i=0; i<ind->nsend[nzone]; i++)
+                for (i = 0; i < ind->nsend[nzone]; i++)
                 {
                     at0 = cgindex[index[i]];
                     at1 = cgindex[index[i]+1];
-                    for(j=at0; j<at1; j++)
+                    for (j = at0; j < at1; j++)
                     {
-                        copy_rvec(x[j],buf[n]);
+                        copy_rvec(x[j], buf[n]);
                         n++;
                     }
                 }
             }
             else if (!bScrew)
             {
-                for(i=0; i<ind->nsend[nzone]; i++)
+                for (i = 0; i < ind->nsend[nzone]; i++)
                 {
                     at0 = cgindex[index[i]];
                     at1 = cgindex[index[i]+1];
-                    for(j=at0; j<at1; j++)
+                    for (j = at0; j < at1; j++)
                     {
                         /* We need to shift the coordinates */
-                        rvec_add(x[j],shift,buf[n]);
+                        rvec_add(x[j], shift, buf[n]);
                         n++;
                     }
                 }
             }
             else
             {
-                for(i=0; i<ind->nsend[nzone]; i++)
+                for (i = 0; i < ind->nsend[nzone]; i++)
                 {
                     at0 = cgindex[index[i]];
                     at1 = cgindex[index[i]+1];
-                    for(j=at0; j<at1; j++)
+                    for (j = at0; j < at1; j++)
                     {
                         /* Shift x */
                         buf[n][XX] = x[j][XX] + shift[XX];
@@ -689,7 +725,7 @@ void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
                     }
                 }
             }
-            
+
             if (cd->bInPlace)
             {
                 rbuf = x + nat_tot;
@@ -705,11 +741,11 @@ void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
             if (!cd->bInPlace)
             {
                 j = 0;
-                for(zone=0; zone<nzone; zone++)
+                for (zone = 0; zone < nzone; zone++)
                 {
-                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
+                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                     {
-                        copy_rvec(rbuf[j],x[i]);
+                        copy_rvec(rbuf[j], x[i]);
                         j++;
                     }
                 }
@@ -720,28 +756,28 @@ void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
     }
 }
 
-void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
+void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 {
-    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
-    int  *index,*cgindex;
-    gmx_domdec_comm_t *comm;
+    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
+    int                   *index, *cgindex;
+    gmx_domdec_comm_t     *comm;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t *ind;
-    rvec *buf,*sbuf;
-    ivec vis;
-    int  is;
-    gmx_bool bPBC,bScrew;
-    
+    gmx_domdec_ind_t      *ind;
+    rvec                  *buf, *sbuf;
+    ivec                   vis;
+    int                    is;
+    gmx_bool               bPBC, bScrew;
+
     comm = dd->comm;
-    
+
     cgindex = dd->cgindex;
 
     buf = comm->vbuf.v;
 
-    n = 0;
-    nzone = comm->zones.n/2;
+    n       = 0;
+    nzone   = comm->zones.n/2;
     nat_tot = dd->nat_tot;
-    for(d=dd->ndim-1; d>=0; d--)
+    for (d = dd->ndim-1; d >= 0; d--)
     {
         bPBC   = (dd->ci[dd->dim[d]] == 0);
         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
@@ -752,11 +788,12 @@ void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
         /* Determine which shift vector we need */
         clear_ivec(vis);
         vis[dd->dim[d]] = 1;
-        is = IVEC2IS(vis);
-        
+        is              = IVEC2IS(vis);
+
         cd = &comm->cd[d];
-        for(p=cd->np-1; p>=0; p--) {
-            ind = &cd->ind[p];
+        for (p = cd->np-1; p >= 0; p--)
+        {
+            ind      = &cd->ind[p];
             nat_tot -= ind->nrecv[nzone+1];
             if (cd->bInPlace)
             {
@@ -765,12 +802,12 @@ void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
             else
             {
                 sbuf = comm->vbuf2.v;
-                j = 0;
-                for(zone=0; zone<nzone; zone++)
+                j    = 0;
+                for (zone = 0; zone < nzone; zone++)
                 {
-                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
+                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                     {
-                        copy_rvec(f[i],sbuf[j]);
+                        copy_rvec(f[i], sbuf[j]);
                         j++;
                     }
                 }
@@ -784,39 +821,39 @@ void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
             n = 0;
             if (!bPBC)
             {
-                for(i=0; i<ind->nsend[nzone]; i++)
+                for (i = 0; i < ind->nsend[nzone]; i++)
                 {
                     at0 = cgindex[index[i]];
                     at1 = cgindex[index[i]+1];
-                    for(j=at0; j<at1; j++)
+                    for (j = at0; j < at1; j++)
                     {
-                        rvec_inc(f[j],buf[n]);
+                        rvec_inc(f[j], buf[n]);
                         n++;
                     }
-                } 
+                }
             }
             else if (!bScrew)
             {
-                for(i=0; i<ind->nsend[nzone]; i++)
+                for (i = 0; i < ind->nsend[nzone]; i++)
                 {
                     at0 = cgindex[index[i]];
                     at1 = cgindex[index[i]+1];
-                    for(j=at0; j<at1; j++)
+                    for (j = at0; j < at1; j++)
                     {
-                        rvec_inc(f[j],buf[n]);
+                        rvec_inc(f[j], buf[n]);
                         /* Add this force to the shift force */
-                        rvec_inc(fshift[is],buf[n]);
+                        rvec_inc(fshift[is], buf[n]);
                         n++;
                     }
                 }
             }
             else
             {
-                for(i=0; i<ind->nsend[nzone]; i++)
+                for (i = 0; i < ind->nsend[nzone]; i++)
                 {
                     at0 = cgindex[index[i]];
                     at1 = cgindex[index[i]+1];
-                    for(j=at0; j<at1; j++)
+                    for (j = at0; j < at1; j++)
                     {
                         /* Rotate the force */
                         f[j][XX] += buf[n][XX];
@@ -825,7 +862,7 @@ void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
                         if (fshift)
                         {
                             /* Add this force to the shift force */
-                            rvec_inc(fshift[is],buf[n]);
+                            rvec_inc(fshift[is], buf[n]);
                         }
                         n++;
                     }
@@ -836,42 +873,42 @@ void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
     }
 }
 
-void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
+void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 {
-    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
-    int  *index,*cgindex;
-    gmx_domdec_comm_t *comm;
+    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
+    int                   *index, *cgindex;
+    gmx_domdec_comm_t     *comm;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t *ind;
-    real *buf,*rbuf;
-    
+    gmx_domdec_ind_t      *ind;
+    real                  *buf, *rbuf;
+
     comm = dd->comm;
-    
+
     cgindex = dd->cgindex;
-    
+
     buf = &comm->vbuf.v[0][0];
 
-    nzone = 1;
+    nzone   = 1;
     nat_tot = dd->nat_home;
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
         cd = &comm->cd[d];
-        for(p=0; p<cd->np; p++)
+        for (p = 0; p < cd->np; p++)
         {
-            ind = &cd->ind[p];
+            ind   = &cd->ind[p];
             index = ind->index;
-            n = 0;
-            for(i=0; i<ind->nsend[nzone]; i++)
+            n     = 0;
+            for (i = 0; i < ind->nsend[nzone]; i++)
             {
                 at0 = cgindex[index[i]];
                 at1 = cgindex[index[i]+1];
-                for(j=at0; j<at1; j++)
+                for (j = at0; j < at1; j++)
                 {
                     buf[n] = v[j];
                     n++;
                 }
             }
-            
+
             if (cd->bInPlace)
             {
                 rbuf = v + nat_tot;
@@ -887,9 +924,9 @@ void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
             if (!cd->bInPlace)
             {
                 j = 0;
-                for(zone=0; zone<nzone; zone++)
+                for (zone = 0; zone < nzone; zone++)
                 {
-                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
+                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                     {
                         v[i] = rbuf[j];
                         j++;
@@ -902,29 +939,30 @@ void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
     }
 }
 
-void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
+void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 {
-    int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
-    int  *index,*cgindex;
-    gmx_domdec_comm_t *comm;
+    int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
+    int                   *index, *cgindex;
+    gmx_domdec_comm_t     *comm;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t *ind;
-    real *buf,*sbuf;
-    
+    gmx_domdec_ind_t      *ind;
+    real                  *buf, *sbuf;
+
     comm = dd->comm;
-    
+
     cgindex = dd->cgindex;
 
     buf = &comm->vbuf.v[0][0];
 
-    n = 0;
-    nzone = comm->zones.n/2;
+    n       = 0;
+    nzone   = comm->zones.n/2;
     nat_tot = dd->nat_tot;
-    for(d=dd->ndim-1; d>=0; d--)
+    for (d = dd->ndim-1; d >= 0; d--)
     {
         cd = &comm->cd[d];
-        for(p=cd->np-1; p>=0; p--) {
-            ind = &cd->ind[p];
+        for (p = cd->np-1; p >= 0; p--)
+        {
+            ind      = &cd->ind[p];
             nat_tot -= ind->nrecv[nzone+1];
             if (cd->bInPlace)
             {
@@ -933,10 +971,10 @@ void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
             else
             {
                 sbuf = &comm->vbuf2.v[0][0];
-                j = 0;
-                for(zone=0; zone<nzone; zone++)
+                j    = 0;
+                for (zone = 0; zone < nzone; zone++)
                 {
-                    for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
+                    for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
                     {
                         sbuf[j] = v[i];
                         j++;
@@ -950,28 +988,28 @@ void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
             index = ind->index;
             /* Add the received forces */
             n = 0;
-            for(i=0; i<ind->nsend[nzone]; i++)
+            for (i = 0; i < ind->nsend[nzone]; i++)
             {
                 at0 = cgindex[index[i]];
                 at1 = cgindex[index[i]+1];
-                for(j=at0; j<at1; j++)
+                for (j = at0; j < at1; j++)
                 {
                     v[j] += buf[n];
                     n++;
                 }
-            } 
+            }
         }
         nzone /= 2;
     }
 }
 
-static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
+static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone) /* Write one line to fp with the indices d, i, j and the zone fields min0, max1, mch0, mch1, p1_0 and p1_1 */
 {
-    fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
-            d,i,j,
-            zone->min0,zone->max1,
-            zone->mch0,zone->mch0,
-            zone->p1_0,zone->p1_1);
+    fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
+            d, i, j,
+            zone->min0, zone->max1,
+            zone->mch0, zone->mch1, /* the column labelled "mch1" must show mch1, not mch0 a second time */
+            zone->p1_0, zone->p1_1);
 }
 
 
@@ -979,16 +1017,16 @@ static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 #define DDZONECOMM_BUFSIZE  3
 
 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
-                               int ddimind,int direction,
-                               gmx_ddzone_t *buf_s,int n_s,
-                               gmx_ddzone_t *buf_r,int n_r)
+                               int ddimind, int direction,
+                               gmx_ddzone_t *buf_s, int n_s,
+                               gmx_ddzone_t *buf_r, int n_r)
 {
 #define ZBS  DDZONECOMM_BUFSIZE
     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
-    int i;
+    int  i;
 
-    for(i=0; i<n_s; i++)
+    for (i = 0; i < n_s; i++)
     {
         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
@@ -1005,7 +1043,7 @@ static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
                      vbuf_s, n_s*ZBS,
                      vbuf_r, n_r*ZBS);
 
-    for(i=0; i<n_r; i++)
+    for (i = 0; i < n_r; i++)
     {
         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
@@ -1019,26 +1057,26 @@ static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 #undef ZBS
 }
 
-static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
-                          rvec cell_ns_x0,rvec cell_ns_x1)
+static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
+                          rvec cell_ns_x0, rvec cell_ns_x1)
 {
-    int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
-    gmx_ddzone_t *zp;
-    gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
-    gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
-    gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
-    rvec extr_s[2],extr_r[2];
-    rvec dh;
-    real dist_d,c=0,det;
+    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
+    gmx_ddzone_t      *zp;
+    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
+    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
+    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
+    rvec               extr_s[2], extr_r[2];
+    rvec               dh;
+    real               dist_d, c = 0, det;
     gmx_domdec_comm_t *comm;
-    gmx_bool bPBC,bUse;
+    gmx_bool           bPBC, bUse;
 
     comm = dd->comm;
 
-    for(d=1; d<dd->ndim; d++)
+    for (d = 1; d < dd->ndim; d++)
     {
-        dim = dd->dim[d];
-        zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
+        dim      = dd->dim[d];
+        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
         zp->min0 = cell_ns_x0[dim];
         zp->max1 = cell_ns_x1[dim];
         zp->min1 = cell_ns_x1[dim];
@@ -1047,8 +1085,8 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
         zp->p1_0 = cell_ns_x0[dim];
         zp->p1_1 = cell_ns_x1[dim];
     }
-    
-    for(d=dd->ndim-2; d>=0; d--)
+
+    for (d = dd->ndim-2; d >= 0; d--)
     {
         dim  = dd->dim[d];
         bPBC = (dim < ddbox->npbcdim);
@@ -1062,7 +1100,7 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
         /* Store the extremes in the backward sending buffer,
          * so the get updated separately from the forward communication.
          */
-        for(d1=d; d1<dd->ndim-1; d1++)
+        for (d1 = d; d1 < dd->ndim-1; d1++)
         {
             /* We invert the order to be able to use the same loop for buf_e */
             buf_s[pos].min0 = extr_s[d1][1];
@@ -1094,7 +1132,7 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
         if (bPBC)
         {
             /* Take the minimum to avoid double communication */
-            npulse_min = min(npulse,dd->nc[dim]-1-npulse);
+            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
         }
         else
         {
@@ -1105,7 +1143,7 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
              */
             npulse_min = npulse;
         }
-        for(p=0; p<npulse_min; p++)
+        for (p = 0; p < npulse_min; p++)
         {
             /* Communicate the extremes forward */
             bUse = (bPBC || dd->ci[dim] > 0);
@@ -1116,17 +1154,17 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 
             if (bUse)
             {
-                for(d1=d; d1<dd->ndim-1; d1++)
+                for (d1 = d; d1 < dd->ndim-1; d1++)
                 {
-                    extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
-                    extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
-                    extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
+                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
+                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
+                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
                 }
             }
         }
 
         buf_size = pos;
-        for(p=0; p<npulse; p++)
+        for (p = 0; p < npulse; p++)
         {
             /* Communicate all the zone information backward */
             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
@@ -1138,7 +1176,7 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
             clear_rvec(dh);
             if (p > 0)
             {
-                for(d1=d+1; d1<dd->ndim; d1++)
+                for (d1 = d+1; d1 < dd->ndim; d1++)
                 {
                     /* Determine the decrease of maximum required
                      * communication height along d1 due to the distance along d,
@@ -1171,7 +1209,7 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
             }
 
             /* Accumulate the extremes over all pulses */
-            for(i=0; i<buf_size; i++)
+            for (i = 0; i < buf_size; i++)
             {
                 if (p == 0)
                 {
@@ -1181,9 +1219,9 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
                 {
                     if (bUse)
                     {
-                        buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
-                        buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
-                        buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
+                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
+                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
+                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
                     }
 
                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
@@ -1196,8 +1234,8 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
                     }
                     if (bUse && dh[d1] >= 0)
                     {
-                        buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
-                        buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
+                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
+                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                     }
                 }
                 /* Copy the received buffer to the send buffer,
@@ -1208,20 +1246,20 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
             {
-                /* Store the extremes */ 
+                /* Store the extremes */
                 pos = 0;
 
-                for(d1=d; d1<dd->ndim-1; d1++)
+                for (d1 = d; d1 < dd->ndim-1; d1++)
                 {
-                    extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
-                    extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
-                    extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
+                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
+                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
+                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
                     pos++;
                 }
 
                 if (d == 1 || (d == 0 && dd->ndim == 3))
                 {
-                    for(i=d; i<2; i++)
+                    for (i = d; i < 2; i++)
                     {
                         comm->zone_d2[1-d][i] = buf_e[pos];
                         pos++;
@@ -1235,75 +1273,80 @@ static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
             }
         }
     }
-    
+
     if (dd->ndim >= 2)
     {
         dim = dd->dim[1];
-        for(i=0; i<2; i++)
+        for (i = 0; i < 2; i++)
         {
             if (debug)
             {
-                print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
+                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
             }
-            cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
-            cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
+            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
+            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
         }
     }
     if (dd->ndim >= 3)
     {
         dim = dd->dim[2];
-        for(i=0; i<2; i++)
+        for (i = 0; i < 2; i++)
         {
-            for(j=0; j<2; j++)
+            for (j = 0; j < 2; j++)
             {
                 if (debug)
                 {
-                    print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
+                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
                 }
-                cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
-                cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
+                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
+                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
             }
         }
     }
-    for(d=1; d<dd->ndim; d++)
+    for (d = 1; d < dd->ndim; d++)
     {
         comm->cell_f_max0[d] = extr_s[d-1][0];
         comm->cell_f_min1[d] = extr_s[d-1][1];
         if (debug)
         {
-            fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
-                    d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
+            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
+                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
         }
     }
 }
 
 static void dd_collect_cg(gmx_domdec_t *dd,
-                          t_state *state_local)
+                          t_state      *state_local)
 {
-    gmx_domdec_master_t *ma=NULL;
-    int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
-    t_block *cgs_gl;
+    gmx_domdec_master_t *ma = NULL;
+    int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
 
     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
     {
         /* The master has the correct distribution */
         return;
     }
-    
+
     if (state_local->ddp_count == dd->ddp_count)
     {
+        /* The local state and DD are in sync, use the DD indices */
         ncg_home = dd->ncg_home;
         cg       = dd->index_gl;
         nat_home = dd->nat_home;
-    } 
+    }
     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
     {
+        /* The DD is out of sync with the local state, but we have stored
+         * the cg indices with the local state, so we can use those.
+         */
+        t_block *cgs_gl;
+
         cgs_gl = &dd->comm->cgs_gl;
 
         ncg_home = state_local->ncg_gl;
         cg       = state_local->cg_gl;
         nat_home = 0;
-        for(i=0; i<ncg_home; i++)
+        for (i = 0; i < ncg_home; i++)
         {
             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
         }
@@ -1312,12 +1355,12 @@ static void dd_collect_cg(gmx_domdec_t *dd,
     {
         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
     }
-    
-    buf2[0] = dd->ncg_home;
-    buf2[1] = dd->nat_home;
+
+    buf2[0] = ncg_home;
+    buf2[1] = nat_home;
     if (DDMASTER(dd))
     {
-        ma = dd->ma;
+        ma   = dd->ma;
         ibuf = ma->ibuf;
     }
     else
@@ -1325,92 +1368,96 @@ static void dd_collect_cg(gmx_domdec_t *dd,
         ibuf = NULL;
     }
     /* Collect the charge group and atom counts on the master */
-    dd_gather(dd,2*sizeof(int),buf2,ibuf);
-    
+    dd_gather(dd, 2*sizeof(int), buf2, ibuf);
+
     if (DDMASTER(dd))
     {
         ma->index[0] = 0;
-        for(i=0; i<dd->nnodes; i++)
+        for (i = 0; i < dd->nnodes; i++)
         {
-            ma->ncg[i] = ma->ibuf[2*i];
-            ma->nat[i] = ma->ibuf[2*i+1];
+            ma->ncg[i]     = ma->ibuf[2*i];
+            ma->nat[i]     = ma->ibuf[2*i+1];
             ma->index[i+1] = ma->index[i] + ma->ncg[i];
-            
+
         }
         /* Make byte counts and indices */
-        for(i=0; i<dd->nnodes; i++)
+        for (i = 0; i < dd->nnodes; i++)
         {
-            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
+            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
         }
         if (debug)
         {
-            fprintf(debug,"Initial charge group distribution: ");
-            for(i=0; i<dd->nnodes; i++)
-                fprintf(debug," %d",ma->ncg[i]);
-            fprintf(debug,"\n");
+            fprintf(debug, "Initial charge group distribution: ");
+            for (i = 0; i < dd->nnodes; i++)
+            {
+                fprintf(debug, " %d", ma->ncg[i]);
+            }
+            fprintf(debug, "\n");
         }
     }
-    
+
     /* Collect the charge group indices on the master */
     dd_gatherv(dd,
-               dd->ncg_home*sizeof(int),dd->index_gl,
+               ncg_home*sizeof(int), cg,
                DDMASTER(dd) ? ma->ibuf : NULL,
                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
                DDMASTER(dd) ? ma->cg : NULL);
-    
+
     dd->comm->master_cg_ddp_count = state_local->ddp_count;
 }
 
 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
-                                    rvec *lv,rvec *v)
+                                    rvec *lv, rvec *v)
 {
     gmx_domdec_master_t *ma;
-    int  n,i,c,a,nalloc=0;
-    rvec *buf=NULL;
-    t_block *cgs_gl;
+    int                  n, i, c, a, nalloc = 0;
+    rvec                *buf = NULL;
+    t_block             *cgs_gl;
 
     ma = dd->ma;
-    
+
     if (!DDMASTER(dd))
     {
 #ifdef GMX_MPI
-        MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
-                 dd->rank,dd->mpi_comm_all);
+        MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
+                 dd->rank, dd->mpi_comm_all);
 #endif
-    } else {
+    }
+    else
+    {
         /* Copy the master coordinates to the global array */
         cgs_gl = &dd->comm->cgs_gl;
 
         n = DDMASTERRANK(dd);
         a = 0;
-        for(i=ma->index[n]; i<ma->index[n+1]; i++)
+        for (i = ma->index[n]; i < ma->index[n+1]; i++)
         {
-            for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
+            for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
             {
-                copy_rvec(lv[a++],v[c]);
+                copy_rvec(lv[a++], v[c]);
             }
         }
-        
-        for(n=0; n<dd->nnodes; n++)
+
+        for (n = 0; n < dd->nnodes; n++)
         {
             if (n != dd->rank)
             {
                 if (ma->nat[n] > nalloc)
                 {
                     nalloc = over_alloc_dd(ma->nat[n]);
-                    srenew(buf,nalloc);
+                    srenew(buf, nalloc);
                 }
 #ifdef GMX_MPI
-                MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
-                         n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
+                MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
+                         n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 #endif
                 a = 0;
-                for(i=ma->index[n]; i<ma->index[n+1]; i++)
+                for (i = ma->index[n]; i < ma->index[n+1]; i++)
                 {
-                    for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
+                    for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
                     {
-                        copy_rvec(buf[a++],v[c]);
+                        copy_rvec(buf[a++], v[c]);
                     }
                 }
             }
@@ -1420,17 +1467,17 @@ static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
 }
 
 static void get_commbuffer_counts(gmx_domdec_t *dd,
-                                  int **counts,int **disps)
+                                  int **counts, int **disps)
 {
     gmx_domdec_master_t *ma;
-    int n;
+    int                  n;
 
     ma = dd->ma;
-    
+
     /* Make the rvec count and displacment arrays */
     *counts  = ma->ibuf;
     *disps   = ma->ibuf + dd->nnodes;
-    for(n=0; n<dd->nnodes; n++)
+    for (n = 0; n < dd->nnodes; n++)
     {
         (*counts)[n] = ma->nat[n]*sizeof(rvec);
         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
@@ -1438,37 +1485,37 @@ static void get_commbuffer_counts(gmx_domdec_t *dd,
 }
 
 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
-                                   rvec *lv,rvec *v)
+                                   rvec *lv, rvec *v)
 {
     gmx_domdec_master_t *ma;
-    int  *rcounts=NULL,*disps=NULL;
-    int  n,i,c,a;
-    rvec *buf=NULL;
-    t_block *cgs_gl;
-    
+    int                 *rcounts = NULL, *disps = NULL;
+    int                  n, i, c, a;
+    rvec                *buf = NULL;
+    t_block             *cgs_gl;
+
     ma = dd->ma;
-    
+
     if (DDMASTER(dd))
     {
-        get_commbuffer_counts(dd,&rcounts,&disps);
+        get_commbuffer_counts(dd, &rcounts, &disps);
 
         buf = ma->vbuf;
     }
-    
-    dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
+
+    dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
 
     if (DDMASTER(dd))
     {
         cgs_gl = &dd->comm->cgs_gl;
 
         a = 0;
-        for(n=0; n<dd->nnodes; n++)
+        for (n = 0; n < dd->nnodes; n++)
         {
-            for(i=ma->index[n]; i<ma->index[n+1]; i++)
+            for (i = ma->index[n]; i < ma->index[n+1]; i++)
             {
-                for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
+                for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
                 {
-                    copy_rvec(buf[a++],v[c]);
+                    copy_rvec(buf[a++], v[c]);
                 }
             }
         }
@@ -1476,185 +1523,156 @@ static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
 }
 
 void dd_collect_vec(gmx_domdec_t *dd,
-                    t_state *state_local,rvec *lv,rvec *v)
+                    t_state *state_local, rvec *lv, rvec *v) /* Collect the local vector lv into v on the master, via either the send/recv or the gatherv helper */
 {
     gmx_domdec_master_t *ma;
-    int  n,i,c,a,nalloc=0;
-    rvec *buf=NULL;
-    
-    dd_collect_cg(dd,state_local);
+    int                  n, i, c, a, nalloc = 0; /* NOTE(review): ma, n, i, c, a, nalloc and buf are never referenced in this function */
+    rvec                *buf = NULL;
+
+    dd_collect_cg(dd, state_local); /* Refreshes the master's charge group counts and indices unless they already match state_local->ddp_count */
 
     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
     {
-        dd_collect_vec_sendrecv(dd,lv,v);
+        dd_collect_vec_sendrecv(dd, lv, v); /* At most GMX_DD_NNODES_SENDRECV ranks: point-to-point MPI_Send/MPI_Recv path */
     }
     else
     {
-        dd_collect_vec_gatherv(dd,lv,v);
+        dd_collect_vec_gatherv(dd, lv, v); /* More ranks: a single dd_gatherv call */
     }
 }
 
 
 void dd_collect_state(gmx_domdec_t *dd,
-                      t_state *state_local,t_state *state)
+                      t_state *state_local, t_state *state) /* Copy/collect the entries of state_local into state */
 {
-    int est,i,j,nh;
+    int est, i, j, nh;
 
     nh = state->nhchainlength;
 
     if (DDMASTER(dd))
     {
-        for (i=0;i<efptNR;i++) {
+        for (i = 0; i < efptNR; i++) /* Inside this branch the master copies entries straight from its own state_local */
+        {
             state->lambda[i] = state_local->lambda[i];
         }
         state->fep_state = state_local->fep_state;
-        state->veta = state_local->veta;
-        state->vol0 = state_local->vol0;
-        copy_mat(state_local->box,state->box);
-        copy_mat(state_local->boxv,state->boxv);
-        copy_mat(state_local->svir_prev,state->svir_prev);
-        copy_mat(state_local->fvir_prev,state->fvir_prev);
-        copy_mat(state_local->pres_prev,state->pres_prev);
-
+        state->veta      = state_local->veta;
+        state->vol0      = state_local->vol0;
+        copy_mat(state_local->box, state->box);
+        copy_mat(state_local->boxv, state->boxv);
+        copy_mat(state_local->svir_prev, state->svir_prev);
+        copy_mat(state_local->fvir_prev, state->fvir_prev);
+        copy_mat(state_local->pres_prev, state->pres_prev);
 
-        for(i=0; i<state_local->ngtc; i++)
+        for (i = 0; i < state_local->ngtc; i++)
         {
-            for(j=0; j<nh; j++) {
+            for (j = 0; j < nh; j++) /* nh values per group i, stored contiguously at i*nh+j */
+            {
                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
             }
-            state->therm_integral[i] = state_local->therm_integral[i];            
+            state->therm_integral[i] = state_local->therm_integral[i];
         }
-        for(i=0; i<state_local->nnhpres; i++) 
+        for (i = 0; i < state_local->nnhpres; i++)
         {
-            for(j=0; j<nh; j++) {
+            for (j = 0; j < nh; j++)
+            {
                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
             }
         }
     }
-    for(est=0; est<estNR; est++)
+    for (est = 0; est < estNR; est++) /* Runs on every rank, not only the master */
     {
         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
         {
-            switch (est) {
-            case estX:
-                dd_collect_vec(dd,state_local,state_local->x,state->x);
-                break;
-            case estV:
-                dd_collect_vec(dd,state_local,state_local->v,state->v);
-                break;
-            case estSDX:
-                dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
-                break;
-            case estCGP:
-                dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
-                break;
-            case estLD_RNG:
-                if (state->nrngi == 1)
-                {
-                    if (DDMASTER(dd))
-                    {
-                        for(i=0; i<state_local->nrng; i++)
-                        {
-                            state->ld_rng[i] = state_local->ld_rng[i];
-                        }
-                    }
-                }
-                else
-                {
-                    dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
-                              state_local->ld_rng,state->ld_rng);
-                }
-                break;
-            case estLD_RNGI:
-                if (state->nrngi == 1)
-                {
-                   if (DDMASTER(dd))
-                    {
-                        state->ld_rngi[0] = state_local->ld_rngi[0];
-                    } 
-                }
-                else
-                {
-                    dd_gather(dd,sizeof(state->ld_rngi[0]),
-                              state_local->ld_rngi,state->ld_rngi);
-                }
-                break;
-            case estDISRE_INITF:
-            case estDISRE_RM3TAV:
-            case estORIRE_INITF:
-            case estORIRE_DTAV:
-                break;
-            default:
-                gmx_incons("Unknown state entry encountered in dd_collect_state");
+            switch (est) /* Only entries that satisfy EST_DISTR and are flagged in state_local->flags reach this switch */
+            {
+                case estX:
+                    dd_collect_vec(dd, state_local, state_local->x, state->x);
+                    break;
+                case estV:
+                    dd_collect_vec(dd, state_local, state_local->v, state->v);
+                    break;
+                case estSDX:
+                    dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
+                    break;
+                case estCGP:
+                    dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
+                    break;
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    break; /* Nothing is collected for these four entries */
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_collect_state");
             }
         }
     }
 }
 
-static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
+static void dd_realloc_state(t_state *state, rvec **f, int nalloc) /* Resize the flagged distributed state vectors, and *f when f != NULL, to over_alloc_dd(nalloc) elements */
 {
     int est;
 
     if (debug)
     {
-        fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
+        fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
     }
 
     state->nalloc = over_alloc_dd(nalloc);
-    
-    for(est=0; est<estNR; est++)
+
+    for (est = 0; est < estNR; est++) /* Visit every state entry; only those passing the EST_DISTR and flags test below are resized */
     {
         if (EST_DISTR(est) && (state->flags & (1<<est)))
         {
-            switch(est) {
-            case estX:
-                srenew(state->x,state->nalloc);
-                break;
-            case estV:
-                srenew(state->v,state->nalloc);
-                break;
-            case estSDX:
-                srenew(state->sd_X,state->nalloc);
-                break;
-            case estCGP:
-                srenew(state->cg_p,state->nalloc);
-                break;
-            case estLD_RNG:
-            case estLD_RNGI:
-            case estDISRE_INITF:
-            case estDISRE_RM3TAV:
-            case estORIRE_INITF:
-            case estORIRE_DTAV:
-                /* No reallocation required */
-                break;
-            default:
-                gmx_incons("Unknown state entry encountered in dd_realloc_state");            
+            switch (est)
+            {
+                case estX:
+                    srenew(state->x, state->nalloc);
+                    break;
+                case estV:
+                    srenew(state->v, state->nalloc);
+                    break;
+                case estSDX:
+                    srenew(state->sd_X, state->nalloc);
+                    break;
+                case estCGP:
+                    srenew(state->cg_p, state->nalloc);
+                    break;
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* No reallocation required */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_realloc_state");
             }
         }
     }
-    
+
     if (f != NULL)
     {
-        srenew(*f, state->nalloc);
+        srenew(*f, state->nalloc); /* Keep *f the same length as the state vectors */
     }
 }
 
-static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
+static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
                                int nalloc)
 {
     if (nalloc > fr->cg_nalloc)
     {
         if (debug)
         {
-            fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
+            fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
         }
         fr->cg_nalloc = over_alloc_dd(nalloc);
-        srenew(fr->cginfo,fr->cg_nalloc);
+        srenew(fr->cginfo, fr->cg_nalloc);
         if (fr->cutoff_scheme == ecutsGROUP)
         {
-            srenew(fr->cg_cm,fr->cg_nalloc);
+            srenew(fr->cg_cm, fr->cg_nalloc);
         }
     }
     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
@@ -1662,222 +1680,232 @@ static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
         /* We don't use charge groups, we use x in state to set up
          * the atom communication.
          */
-        dd_realloc_state(state,f,nalloc);
+        dd_realloc_state(state, f, nalloc);
     }
 }
 
-static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
-                                       rvec *v,rvec *lv)
+static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
+                                       rvec *v, rvec *lv)
 {
     gmx_domdec_master_t *ma;
-    int  n,i,c,a,nalloc=0;
-    rvec *buf=NULL;
-    
+    int                  n, i, c, a, nalloc = 0;
+    rvec                *buf = NULL;
+
     if (DDMASTER(dd))
     {
         ma  = dd->ma;
-        
-        for(n=0; n<dd->nnodes; n++)
+
+        for (n = 0; n < dd->nnodes; n++)
         {
             if (n != dd->rank)
             {
                 if (ma->nat[n] > nalloc)
                 {
                     nalloc = over_alloc_dd(ma->nat[n]);
-                    srenew(buf,nalloc);
+                    srenew(buf, nalloc);
                 }
                 /* Use lv as a temporary buffer */
                 a = 0;
-                for(i=ma->index[n]; i<ma->index[n+1]; i++)
+                for (i = ma->index[n]; i < ma->index[n+1]; i++)
                 {
-                    for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
+                    for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
                     {
-                        copy_rvec(v[c],buf[a++]);
+                        copy_rvec(v[c], buf[a++]);
                     }
                 }
                 if (a != ma->nat[n])
                 {
-                    gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
-                              a,ma->nat[n]);
+                    gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
+                              a, ma->nat[n]);
                 }
-                
+
 #ifdef GMX_MPI
-                MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
-                         DDRANK(dd,n),n,dd->mpi_comm_all);
+                MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
+                         DDRANK(dd, n), n, dd->mpi_comm_all);
 #endif
             }
         }
         sfree(buf);
         n = DDMASTERRANK(dd);
         a = 0;
-        for(i=ma->index[n]; i<ma->index[n+1]; i++)
+        for (i = ma->index[n]; i < ma->index[n+1]; i++)
         {
-            for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
+            for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
             {
-                copy_rvec(v[c],lv[a++]);
+                copy_rvec(v[c], lv[a++]);
             }
         }
     }
     else
     {
 #ifdef GMX_MPI
-        MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
-                 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
+        MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
+                 MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
 #endif
     }
 }
 
-static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
-                                       rvec *v,rvec *lv)
+static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
+                                       rvec *v, rvec *lv)
 {
     gmx_domdec_master_t *ma;
-    int  *scounts=NULL,*disps=NULL;
-    int  n,i,c,a,nalloc=0;
-    rvec *buf=NULL;
-    
+    int                 *scounts = NULL, *disps = NULL;
+    int                  n, i, c, a, nalloc = 0;
+    rvec                *buf = NULL;
+
     if (DDMASTER(dd))
     {
         ma  = dd->ma;
-     
-        get_commbuffer_counts(dd,&scounts,&disps);
+
+        get_commbuffer_counts(dd, &scounts, &disps);
 
         buf = ma->vbuf;
-        a = 0;
-        for(n=0; n<dd->nnodes; n++)
+        a   = 0;
+        for (n = 0; n < dd->nnodes; n++)
         {
-            for(i=ma->index[n]; i<ma->index[n+1]; i++)
+            for (i = ma->index[n]; i < ma->index[n+1]; i++)
             {
-                for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
+                for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
                 {
-                    copy_rvec(v[c],buf[a++]);
+                    copy_rvec(v[c], buf[a++]);
                 }
             }
         }
     }
 
-    dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
+    dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
 }
 
-static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
+static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
 {
     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
     {
-        dd_distribute_vec_sendrecv(dd,cgs,v,lv);
+        dd_distribute_vec_sendrecv(dd, cgs, v, lv);
     }
     else
     {
-        dd_distribute_vec_scatterv(dd,cgs,v,lv);
+        dd_distribute_vec_scatterv(dd, cgs, v, lv);
+    }
+}
+
+static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
+{
+    int i;
+    dd_bcast(dd, sizeof(int), &dfhist->bEquil);
+    dd_bcast(dd, sizeof(int), &dfhist->nlambda);
+    dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
+
+    if (dfhist->nlambda > 0)
+    {
+        int nlam = dfhist->nlambda;
+        dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
+        dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
+        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
+        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
+        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
+        dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
+
+        for (i = 0; i < nlam; i++)
+        {
+            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
+            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
+            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
+            dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
+            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
+            dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
+        }
     }
 }
 
-static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
-                                t_state *state,t_state *state_local,
+static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
+                                t_state *state, t_state *state_local,
                                 rvec **f)
 {
-    int  i,j,nh;
+    int  i, j, nh;
 
     nh = state->nhchainlength;
 
     if (DDMASTER(dd))
     {
-        for(i=0;i<efptNR;i++)
+        for (i = 0; i < efptNR; i++)
         {
             state_local->lambda[i] = state->lambda[i];
         }
         state_local->fep_state = state->fep_state;
-        state_local->veta   = state->veta;
-        state_local->vol0   = state->vol0;
-        copy_mat(state->box,state_local->box);
-        copy_mat(state->box_rel,state_local->box_rel);
-        copy_mat(state->boxv,state_local->boxv);
-        copy_mat(state->svir_prev,state_local->svir_prev);
-        copy_mat(state->fvir_prev,state_local->fvir_prev);
-        for(i=0; i<state_local->ngtc; i++)
-        {
-            for(j=0; j<nh; j++) {
+        state_local->veta      = state->veta;
+        state_local->vol0      = state->vol0;
+        copy_mat(state->box, state_local->box);
+        copy_mat(state->box_rel, state_local->box_rel);
+        copy_mat(state->boxv, state_local->boxv);
+        copy_mat(state->svir_prev, state_local->svir_prev);
+        copy_mat(state->fvir_prev, state_local->fvir_prev);
+        copy_df_history(&state_local->dfhist, &state->dfhist);
+        for (i = 0; i < state_local->ngtc; i++)
+        {
+            for (j = 0; j < nh; j++)
+            {
                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
             }
             state_local->therm_integral[i] = state->therm_integral[i];
         }
-        for(i=0; i<state_local->nnhpres; i++)
+        for (i = 0; i < state_local->nnhpres; i++)
         {
-            for(j=0; j<nh; j++) {
+            for (j = 0; j < nh; j++)
+            {
                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
             }
         }
     }
-    dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
-    dd_bcast(dd,sizeof(int),&state_local->fep_state);
-    dd_bcast(dd,sizeof(real),&state_local->veta);
-    dd_bcast(dd,sizeof(real),&state_local->vol0);
-    dd_bcast(dd,sizeof(state_local->box),state_local->box);
-    dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
-    dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
-    dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
-    dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
-    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
-    dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
-    dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
-    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
-    dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
+    dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
+    dd_bcast(dd, sizeof(int), &state_local->fep_state);
+    dd_bcast(dd, sizeof(real), &state_local->veta);
+    dd_bcast(dd, sizeof(real), &state_local->vol0);
+    dd_bcast(dd, sizeof(state_local->box), state_local->box);
+    dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
+    dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
+    dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
+    dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
+    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
+    dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
+    dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
+    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
+    dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
+
+    /* communicate df_history -- required for restarting from checkpoint */
+    dd_distribute_dfhist(dd, &state_local->dfhist);
 
     if (dd->nat_home > state_local->nalloc)
     {
-        dd_realloc_state(state_local,f,dd->nat_home);
+        dd_realloc_state(state_local, f, dd->nat_home);
     }
-    for(i=0; i<estNR; i++)
+    for (i = 0; i < estNR; i++)
     {
         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
         {
-            switch (i) {
-            case estX:
-                dd_distribute_vec(dd,cgs,state->x,state_local->x);
-                break;
-            case estV:
-                dd_distribute_vec(dd,cgs,state->v,state_local->v);
-                break;
-            case estSDX:
-                dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
-                break;
-            case estCGP:
-                dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
-                break;
-            case estLD_RNG:
-                if (state->nrngi == 1)
-                {
-                    dd_bcastc(dd,
-                              state_local->nrng*sizeof(state_local->ld_rng[0]),
-                              state->ld_rng,state_local->ld_rng);
-                }
-                else
-                {
-                    dd_scatter(dd,
-                               state_local->nrng*sizeof(state_local->ld_rng[0]),
-                               state->ld_rng,state_local->ld_rng);
-                }
-                break;
-            case estLD_RNGI:
-                if (state->nrngi == 1)
-                {
-                    dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
-                              state->ld_rngi,state_local->ld_rngi);
-                }
-                else
-                {
-                     dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
-                               state->ld_rngi,state_local->ld_rngi);
-                }   
-                break;
-            case estDISRE_INITF:
-            case estDISRE_RM3TAV:
-            case estORIRE_INITF:
-            case estORIRE_DTAV:
-                /* Not implemented yet */
-                break;
-            default:
-                gmx_incons("Unknown state entry encountered in dd_distribute_state");
+            switch (i)
+            {
+                case estX:
+                    dd_distribute_vec(dd, cgs, state->x, state_local->x);
+                    break;
+                case estV:
+                    dd_distribute_vec(dd, cgs, state->v, state_local->v);
+                    break;
+                case estSDX:
+                    dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
+                    break;
+                case estCGP:
+                    dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
+                    break;
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* Not implemented yet */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_distribute_state");
             }
         }
     }
@@ -1885,44 +1913,44 @@ static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
 
 static char dim2char(int dim)
 {
-    char c='?';
-    
+    char c = '?';
+
     switch (dim)
     {
-    case XX: c = 'X'; break;
-    case YY: c = 'Y'; break;
-    case ZZ: c = 'Z'; break;
-    default: gmx_fatal(FARGS,"Unknown dim %d",dim);
+        case XX: c = 'X'; break;
+        case YY: c = 'Y'; break;
+        case ZZ: c = 'Z'; break;
+        default: gmx_fatal(FARGS, "Unknown dim %d", dim);
     }
-    
+
     return c;
 }
 
-static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
-                              gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
+static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
+                              gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
 {
-    rvec grid_s[2],*grid_r=NULL,cx,r;
-    char fname[STRLEN],format[STRLEN],buf[22];
-    FILE *out;
-    int  a,i,d,z,y,x;
+    rvec   grid_s[2], *grid_r = NULL, cx, r;
+    char   fname[STRLEN], buf[22];
+    FILE  *out;
+    int    a, i, d, z, y, x;
     matrix tric;
-    real vol;
+    real   vol;
+
+    copy_rvec(dd->comm->cell_x0, grid_s[0]);
+    copy_rvec(dd->comm->cell_x1, grid_s[1]);
 
-    copy_rvec(dd->comm->cell_x0,grid_s[0]);
-    copy_rvec(dd->comm->cell_x1,grid_s[1]);
-    
     if (DDMASTER(dd))
     {
-        snew(grid_r,2*dd->nnodes);
+        snew(grid_r, 2*dd->nnodes);
     }
-    
-    dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
-    
+
+    dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
+
     if (DDMASTER(dd))
     {
-        for(d=0; d<DIM; d++)
+        for (d = 0; d < DIM; d++)
         {
-            for(i=0; i<DIM; i++)
+            for (i = 0; i < DIM; i++)
             {
                 if (d == i)
                 {
@@ -1941,44 +1969,43 @@ static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
                 }
             }
         }
-        sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
-        sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
-        out = gmx_fio_fopen(fname,"w");
-        gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
+        sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
+        out = gmx_fio_fopen(fname, "w");
+        gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
         a = 1;
-        for(i=0; i<dd->nnodes; i++)
+        for (i = 0; i < dd->nnodes; i++)
         {
             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
-            for(d=0; d<DIM; d++)
+            for (d = 0; d < DIM; d++)
             {
                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
             }
-            for(z=0; z<2; z++)
+            for (z = 0; z < 2; z++)
             {
-                for(y=0; y<2; y++)
+                for (y = 0; y < 2; y++)
                 {
-                    for(x=0; x<2; x++)
+                    for (x = 0; x < 2; x++)
                     {
                         cx[XX] = grid_r[i*2+x][XX];
                         cx[YY] = grid_r[i*2+y][YY];
                         cx[ZZ] = grid_r[i*2+z][ZZ];
-                        mvmul(tric,cx,r);
-                        fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
-                                10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
+                        mvmul(tric, cx, r);
+                        gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
+                                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
                     }
                 }
             }
-            for(d=0; d<DIM; d++)
+            for (d = 0; d < DIM; d++)
             {
-                for(x=0; x<4; x++)
+                for (x = 0; x < 4; x++)
                 {
-                    switch(d)
+                    switch (d)
                     {
-                    case 0: y = 1 + i*8 + 2*x; break;
-                    case 1: y = 1 + i*8 + 2*x - (x % 2); break;
-                    case 2: y = 1 + i*8 + x; break;
+                        case 0: y = 1 + i*8 + 2*x; break;
+                        case 1: y = 1 + i*8 + 2*x - (x % 2); break;
+                        case 2: y = 1 + i*8 + x; break;
                     }
-                    fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
+                    fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
                 }
             }
         }
@@ -1987,36 +2014,33 @@ static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
     }
 }
 
-void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
-                  gmx_mtop_t *mtop,t_commrec *cr,
-                  int natoms,rvec x[],matrix box)
+void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
+                  gmx_mtop_t *mtop, t_commrec *cr,
+                  int natoms, rvec x[], matrix box)
 {
-    char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
-    FILE *out;
-    int  i,ii,resnr,c;
-    char *atomname,*resname;
-    real b;
+    char          fname[STRLEN], buf[22];
+    FILE         *out;
+    int           i, ii, resnr, c;
+    char         *atomname, *resname;
+    real          b;
     gmx_domdec_t *dd;
-    
+
     dd = cr->dd;
     if (natoms == -1)
     {
         natoms = dd->comm->nat[ddnatVSITE];
     }
-    
-    sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
-    
-    sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
-    sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
-    
-    out = gmx_fio_fopen(fname,"w");
-    
-    fprintf(out,"TITLE     %s\n",title);
-    gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
-    for(i=0; i<natoms; i++)
+
+    sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
+
+    out = gmx_fio_fopen(fname, "w");
+
+    fprintf(out, "TITLE     %s\n", title);
+    gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
+    for (i = 0; i < natoms; i++)
     {
         ii = dd->gatindex[i];
-        gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
+        gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
         if (i < dd->comm->nat[ddnatZONE])
         {
             c = 0;
@@ -2034,21 +2058,19 @@ void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
         {
             b = dd->comm->zones.n + 1;
         }
-        fprintf(out,strlen(atomname)<4 ? format : format4,
-                "ATOM",(ii+1)%100000,
-                atomname,resname,' ',resnr%10000,' ',
-                10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
+        gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
+                                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
     }
-    fprintf(out,"TER\n");
-    
+    fprintf(out, "TER\n");
+
     gmx_fio_fclose(out);
 }
 
 real dd_cutoff_mbody(gmx_domdec_t *dd)
 {
     gmx_domdec_comm_t *comm;
-    int  di;
-    real r;
+    int                di;
+    real               r;
 
     comm = dd->comm;
 
@@ -2063,17 +2085,17 @@ real dd_cutoff_mbody(gmx_domdec_t *dd)
         {
             /* cutoff_mbody=0 means we do not have DLB */
             r = comm->cellsize_min[dd->dim[0]];
-            for(di=1; di<dd->ndim; di++)
+            for (di = 1; di < dd->ndim; di++)
             {
-                r = min(r,comm->cellsize_min[dd->dim[di]]);
+                r = min(r, comm->cellsize_min[dd->dim[di]]);
             }
             if (comm->bBondComm)
             {
-                r = max(r,comm->cutoff_mbody);
+                r = max(r, comm->cutoff_mbody);
             }
             else
             {
-                r = min(r,comm->cutoff);
+                r = min(r, comm->cutoff);
             }
         }
     }
@@ -2087,22 +2109,22 @@ real dd_cutoff_twobody(gmx_domdec_t *dd)
 
     r_mb = dd_cutoff_mbody(dd);
 
-    return max(dd->comm->cutoff,r_mb);
+    return max(dd->comm->cutoff, r_mb);
 }
 
 
-static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
+static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
 {
-    int nc,ntot;
-    
+    int nc, ntot;
+
     nc   = dd->nc[dd->comm->cartpmedim];
     ntot = dd->comm->ntot[dd->comm->cartpmedim];
-    copy_ivec(coord,coord_pme);
+    copy_ivec(coord, coord_pme);
     coord_pme[dd->comm->cartpmedim] =
         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
 }
 
-static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
+static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
 {
     /* Here we assign a PME node to communicate with this DD node
      * by assuming that the major index of both is x.
@@ -2111,29 +2133,33 @@ static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
     return (ddindex*npme + npme/2)/ndd;
 }
 
-static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
+static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
 {
-    return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
+    return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
 }
 
-static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
+static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
 {
-    return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
+    return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
 }
 
 static int *dd_pmenodes(t_commrec *cr)
 {
     int *pmenodes;
-    int n,i,p0,p1;
-    
-    snew(pmenodes,cr->npmenodes);
+    int  n, i, p0, p1;
+
+    snew(pmenodes, cr->npmenodes);
     n = 0;
-    for(i=0; i<cr->dd->nnodes; i++) {
-        p0 = cr_ddindex2pmeindex(cr,i);
-        p1 = cr_ddindex2pmeindex(cr,i+1);
-        if (i+1 == cr->dd->nnodes || p1 > p0) {
+    for (i = 0; i < cr->dd->nnodes; i++)
+    {
+        p0 = cr_ddindex2pmeindex(cr, i);
+        p1 = cr_ddindex2pmeindex(cr, i+1);
+        if (i+1 == cr->dd->nnodes || p1 > p0)
+        {
             if (debug)
-                fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
+            {
+                fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
+            }
             pmenodes[n] = i + 1 + n;
             n++;
         }
@@ -2142,54 +2168,54 @@ static int *dd_pmenodes(t_commrec *cr)
     return pmenodes;
 }
 
-static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
+static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
 {
     gmx_domdec_t *dd;
-    ivec coords,coords_pme,nc;
-    int  slab;
-    
+    ivec          coords, coords_pme, nc;
+    int           slab;
+
     dd = cr->dd;
     /*
-      if (dd->comm->bCartesian) {
-      gmx_ddindex2xyz(dd->nc,ddindex,coords);
-      dd_coords2pmecoords(dd,coords,coords_pme);
-      copy_ivec(dd->ntot,nc);
-      nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
-      coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
-      
-      slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
-      } else {
-      slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
-      }
-    */
+       if (dd->comm->bCartesian) {
+       gmx_ddindex2xyz(dd->nc,ddindex,coords);
+       dd_coords2pmecoords(dd,coords,coords_pme);
+       copy_ivec(dd->ntot,nc);
+       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
+       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
+
+       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
+       } else {
+       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
+       }
+     */
     coords[XX] = x;
     coords[YY] = y;
     coords[ZZ] = z;
-    slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
-    
+    slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
+
     return slab;
 }
 
-static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
+static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
 {
     gmx_domdec_comm_t *comm;
-    ivec coords;
-    int  ddindex,nodeid=-1;
-    
+    ivec               coords;
+    int                ddindex, nodeid = -1;
+
     comm = cr->dd->comm;
-    
+
     coords[XX] = x;
     coords[YY] = y;
     coords[ZZ] = z;
     if (comm->bCartesianPP_PME)
     {
 #ifdef GMX_MPI
-        MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
+        MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
 #endif
     }
     else
     {
-        ddindex = dd_index(cr->dd->nc,coords);
+        ddindex = dd_index(cr->dd->nc, coords);
         if (comm->bCartesianPP)
         {
             nodeid = comm->ddindex2simnodeid[ddindex];
@@ -2198,7 +2224,7 @@ static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
         {
             if (comm->pmenodes)
             {
-                nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
+                nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
             }
             else
             {
@@ -2206,31 +2232,31 @@ static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
             }
         }
     }
-  
+
     return nodeid;
 }
 
-static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
+static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
 {
-    gmx_domdec_t *dd;
+    gmx_domdec_t      *dd;
     gmx_domdec_comm_t *comm;
-    ivec coord,coord_pme;
-    int  i;
-    int  pmenode=-1;
-    
-    dd = cr->dd;
+    ivec               coord, coord_pme;
+    int                i;
+    int                pmenode = -1;
+
+    dd   = cr->dd;
     comm = dd->comm;
-    
+
     /* This assumes a uniform x domain decomposition grid cell size */
     if (comm->bCartesianPP_PME)
     {
 #ifdef GMX_MPI
-        MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
+        MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
         {
             /* This is a PP node */
-            dd_cart_coord2pmecoord(dd,coord,coord_pme);
-            MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
+            dd_cart_coord2pmecoord(dd, coord, coord_pme);
+            MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
         }
 #endif
     }
@@ -2238,7 +2264,7 @@ static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
     {
         if (sim_nodeid < dd->nnodes)
         {
-            pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
+            pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
         }
     }
     else
@@ -2251,7 +2277,7 @@ static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
             if (sim_nodeid < dd->nnodes)
             {
                 /* The DD index equals the nodeid */
-                pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
+                pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
             }
         }
         else
@@ -2267,163 +2293,182 @@ static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
             }
         }
     }
-    
+
     return pmenode;
 }
 
-gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
+void get_pme_nnodes(const gmx_domdec_t *dd,
+                    int *npmenodes_x, int *npmenodes_y)
+{
+    if (dd != NULL)
+    {
+        *npmenodes_x = dd->comm->npmenodes_x;
+        *npmenodes_y = dd->comm->npmenodes_y;
+    }
+    else
+    {
+        *npmenodes_x = 1;
+        *npmenodes_y = 1;
+    }
+}
+
+gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
 {
     gmx_bool bPMEOnlyNode;
-    
+
     if (DOMAINDECOMP(cr))
     {
-        bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
+        bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
     }
     else
     {
         bPMEOnlyNode = FALSE;
     }
-    
+
     return bPMEOnlyNode;
 }
 
-void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
-                     int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
+void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
+                     int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
 {
     gmx_domdec_t *dd;
-    int x,y,z;
-    ivec coord,coord_pme;
-    
+    int           x, y, z;
+    ivec          coord, coord_pme;
+
     dd = cr->dd;
-    
-    snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
-    
+
+    snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
+
     *nmy_ddnodes = 0;
-    for(x=0; x<dd->nc[XX]; x++)
+    for (x = 0; x < dd->nc[XX]; x++)
     {
-        for(y=0; y<dd->nc[YY]; y++)
+        for (y = 0; y < dd->nc[YY]; y++)
         {
-            for(z=0; z<dd->nc[ZZ]; z++)
+            for (z = 0; z < dd->nc[ZZ]; z++)
             {
                 if (dd->comm->bCartesianPP_PME)
                 {
                     coord[XX] = x;
                     coord[YY] = y;
                     coord[ZZ] = z;
-                    dd_cart_coord2pmecoord(dd,coord,coord_pme);
+                    dd_cart_coord2pmecoord(dd, coord, coord_pme);
                     if (dd->ci[XX] == coord_pme[XX] &&
                         dd->ci[YY] == coord_pme[YY] &&
                         dd->ci[ZZ] == coord_pme[ZZ])
-                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
+                    {
+                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
+                    }
                 }
                 else
                 {
                     /* The slab corresponds to the nodeid in the PME group */
-                    if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
+                    if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
                     {
-                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
+                        (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
                     }
                 }
             }
         }
     }
-    
+
     /* The last PP-only node is the peer node */
     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
-    
+
     if (debug)
     {
-        fprintf(debug,"Receive coordinates from PP nodes:");
-        for(x=0; x<*nmy_ddnodes; x++)
+        fprintf(debug, "Receive coordinates from PP ranks:");
+        for (x = 0; x < *nmy_ddnodes; x++)
         {
-            fprintf(debug," %d",(*my_ddnodes)[x]);
+            fprintf(debug, " %d", (*my_ddnodes)[x]);
         }
-        fprintf(debug,"\n");
+        fprintf(debug, "\n");
     }
 }
 
 static gmx_bool receive_vir_ener(t_commrec *cr)
 {
     gmx_domdec_comm_t *comm;
-    int  pmenode,coords[DIM],rank;
-    gmx_bool bReceive;
-    
+    int                pmenode, coords[DIM], rank;
+    gmx_bool           bReceive;
+
     bReceive = TRUE;
     if (cr->npmenodes < cr->dd->nnodes)
     {
         comm = cr->dd->comm;
         if (comm->bCartesianPP_PME)
         {
-            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
+            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
 #ifdef GMX_MPI
-            MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
+            MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
             coords[comm->cartpmedim]++;
             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
             {
-                MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
-                if (dd_simnode2pmenode(cr,rank) == pmenode)
+                MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
+                if (dd_simnode2pmenode(cr, rank) == pmenode)
                 {
                     /* This is not the last PP node for pmenode */
                     bReceive = FALSE;
                 }
             }
-#endif  
+#endif
         }
         else
         {
-            pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
+            pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
             if (cr->sim_nodeid+1 < cr->nnodes &&
-                dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
+                dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
             {
                 /* This is not the last PP node for pmenode */
                 bReceive = FALSE;
             }
         }
     }
-    
+
     return bReceive;
 }
 
 static void set_zones_ncg_home(gmx_domdec_t *dd)
 {
     gmx_domdec_zones_t *zones;
-    int i;
+    int                 i;
 
     zones = &dd->comm->zones;
 
     zones->cg_range[0] = 0;
-    for(i=1; i<zones->n+1; i++)
+    for (i = 1; i < zones->n+1; i++)
     {
         zones->cg_range[i] = dd->ncg_home;
     }
+    /* zone_ncg1[0] should always be equal to ncg_home */
+    dd->comm->zone_ncg1[0] = dd->ncg_home;
 }
 
 static void rebuild_cgindex(gmx_domdec_t *dd,
-                            const int *gcgs_index,t_state *state)
-{
-    int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
-    
-    ind = state->cg_gl;
-    dd_cg_gl = dd->index_gl;
-    cgindex  = dd->cgindex;
-    nat = 0;
+                            const int *gcgs_index, t_state *state)
+{
+    int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
+
+    ind        = state->cg_gl;
+    dd_cg_gl   = dd->index_gl;
+    cgindex    = dd->cgindex;
+    nat        = 0;
     cgindex[0] = nat;
-    for(i=0; i<state->ncg_gl; i++)
+    for (i = 0; i < state->ncg_gl; i++)
     {
-        cgindex[i] = nat;
-        cg_gl = ind[i];
+        cgindex[i]  = nat;
+        cg_gl       = ind[i];
         dd_cg_gl[i] = cg_gl;
-        nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
+        nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
     }
     cgindex[i] = nat;
-    
+
     dd->ncg_home = state->ncg_gl;
     dd->nat_home = nat;
 
     set_zones_ncg_home(dd);
 }
 
-static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
+static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
 {
     while (cg >= cginfo_mb->cg_end)
     {
@@ -2433,27 +2478,27 @@ static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
 }
 
-static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
-                          t_forcerec *fr,char *bLocalCG)
+static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
+                          t_forcerec *fr, char *bLocalCG)
 {
     cginfo_mb_t *cginfo_mb;
-    int *cginfo;
-    int cg;
+    int         *cginfo;
+    int          cg;
 
     if (fr != NULL)
     {
         cginfo_mb = fr->cginfo_mb;
         cginfo    = fr->cginfo;
 
-        for(cg=cg0; cg<cg1; cg++)
+        for (cg = cg0; cg < cg1; cg++)
         {
-            cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
+            cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
         }
     }
 
     if (bLocalCG != NULL)
     {
-        for(cg=cg0; cg<cg1; cg++)
+        for (cg = cg0; cg < cg1; cg++)
         {
             bLocalCG[index_gl[cg]] = TRUE;
         }
@@ -2461,20 +2506,20 @@ static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
 }
 
 static void make_dd_indices(gmx_domdec_t *dd,
-                            const int *gcgs_index,int cg_start)
+                            const int *gcgs_index, int cg_start)
 {
-    int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
-    int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
+    int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
+    int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
     gmx_ga2la_t *ga2la;
-    char *bLocalCG;
-    gmx_bool bCGs;
+    char        *bLocalCG;
+    gmx_bool     bCGs;
 
     bLocalCG = dd->comm->bLocalCG;
 
     if (dd->nat_tot > dd->gatindex_nalloc)
     {
         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
-        srenew(dd->gatindex,dd->gatindex_nalloc);
+        srenew(dd->gatindex, dd->gatindex_nalloc);
     }
 
     nzone      = dd->comm->zones.n;
@@ -2488,10 +2533,10 @@ static void make_dd_indices(gmx_domdec_t *dd,
     {
         gmx_incons("dd->ncg_zone is not up to date");
     }
-    
+
     /* Make the local to global and global to local atom index */
     a = dd->cgindex[cg_start];
-    for(zone=0; zone<nzone; zone++)
+    for (zone = 0; zone < nzone; zone++)
     {
         if (zone == 0)
         {
@@ -2504,7 +2549,7 @@ static void make_dd_indices(gmx_domdec_t *dd,
         cg1    = zone2cg[zone+1];
         cg1_p1 = cg0 + zone_ncg1[zone];
 
-        for(cg=cg0; cg<cg1; cg++)
+        for (cg = cg0; cg < cg1; cg++)
         {
             zone1 = zone;
             if (cg >= cg1_p1)
@@ -2515,44 +2560,44 @@ static void make_dd_indices(gmx_domdec_t *dd,
             cg_gl = index_gl[cg];
             if (bCGs)
             {
-                for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
+                for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
                 {
                     gatindex[a] = a_gl;
-                    ga2la_set(dd->ga2la,a_gl,a,zone1);
+                    ga2la_set(dd->ga2la, a_gl, a, zone1);
                     a++;
                 }
             }
             else
             {
                 gatindex[a] = cg_gl;
-                ga2la_set(dd->ga2la,cg_gl,a,zone1);
+                ga2la_set(dd->ga2la, cg_gl, a, zone1);
                 a++;
             }
         }
     }
 }
 
-static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
+static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
                           const char *where)
 {
-    int ncg,i,ngl,nerr;
+    int ncg, i, ngl, nerr;
 
     nerr = 0;
     if (bLocalCG == NULL)
     {
         return nerr;
     }
-    for(i=0; i<dd->ncg_tot; i++)
+    for (i = 0; i < dd->ncg_tot; i++)
     {
         if (!bLocalCG[dd->index_gl[i]])
         {
             fprintf(stderr,
-                    "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
+                    "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
             nerr++;
         }
     }
     ngl = 0;
-    for(i=0; i<ncg_sys; i++)
+    for (i = 0; i < ncg_sys; i++)
     {
         if (bLocalCG[i])
         {
@@ -2561,7 +2606,7 @@ static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
     }
     if (ngl != dd->ncg_tot)
     {
-        fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
+        fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
         nerr++;
     }
 
@@ -2569,22 +2614,22 @@ static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
 }
 
 static void check_index_consistency(gmx_domdec_t *dd,
-                                    int natoms_sys,int ncg_sys,
+                                    int natoms_sys, int ncg_sys,
                                     const char *where)
 {
-    int  nerr,ngl,i,a,cell;
+    int   nerr, ngl, i, a, cell;
     int  *have;
 
     nerr = 0;
 
     if (dd->comm->DD_debug > 1)
     {
-        snew(have,natoms_sys);
-        for(a=0; a<dd->nat_tot; a++)
+        snew(have, natoms_sys);
+        for (a = 0; a < dd->nat_tot; a++)
         {
             if (have[dd->gatindex[a]] > 0)
             {
-                fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
+                fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
             }
             else
             {
@@ -2594,16 +2639,16 @@ static void check_index_consistency(gmx_domdec_t *dd,
         sfree(have);
     }
 
-    snew(have,dd->nat_tot);
+    snew(have, dd->nat_tot);
 
     ngl  = 0;
-    for(i=0; i<natoms_sys; i++)
+    for (i = 0; i < natoms_sys; i++)
     {
-        if (ga2la_get(dd->ga2la,i,&a,&cell))
+        if (ga2la_get(dd->ga2la, i, &a, &cell))
         {
             if (a >= dd->nat_tot)
             {
-                fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
+                fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
                 nerr++;
             }
             else
@@ -2611,7 +2656,7 @@ static void check_index_consistency(gmx_domdec_t *dd,
                 have[a] = 1;
                 if (dd->gatindex[a] != i)
                 {
-                    fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
+                    fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
                     nerr++;
                 }
             }
@@ -2621,31 +2666,32 @@ static void check_index_consistency(gmx_domdec_t *dd,
     if (ngl != dd->nat_tot)
     {
         fprintf(stderr,
-                "DD node %d, %s: %d global atom indices, %d local atoms\n",
-                dd->rank,where,ngl,dd->nat_tot);
+                "DD rank %d, %s: %d global atom indices, %d local atoms\n",
+                dd->rank, where, ngl, dd->nat_tot);
     }
-    for(a=0; a<dd->nat_tot; a++)
+    for (a = 0; a < dd->nat_tot; a++)
     {
         if (have[a] == 0)
         {
             fprintf(stderr,
-                    "DD node %d, %s: local atom %d, global %d has no global index\n",
-                    dd->rank,where,a+1,dd->gatindex[a]+1);
+                    "DD rank %d, %s: local atom %d, global %d has no global index\n",
+                    dd->rank, where, a+1, dd->gatindex[a]+1);
         }
     }
     sfree(have);
 
-    nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
+    nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
 
-    if (nerr > 0) {
-        gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
-                  dd->rank,where,nerr);
+    if (nerr > 0)
+    {
+        gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
+                  dd->rank, where, nerr);
     }
 }
 
-static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
+static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
 {
-    int  i;
+    int   i;
     char *bLocalCG;
 
     if (a_start == 0)
@@ -2655,30 +2701,58 @@ static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
     }
     else
     {
-        for(i=a_start; i<dd->nat_tot; i++)
+        for (i = a_start; i < dd->nat_tot; i++)
         {
-            ga2la_del(dd->ga2la,dd->gatindex[i]);
+            ga2la_del(dd->ga2la, dd->gatindex[i]);
         }
     }
 
     bLocalCG = dd->comm->bLocalCG;
     if (bLocalCG)
     {
-        for(i=cg_start; i<dd->ncg_tot; i++)
+        for (i = cg_start; i < dd->ncg_tot; i++)
         {
             bLocalCG[dd->index_gl[i]] = FALSE;
         }
     }
 
     dd_clear_local_vsite_indices(dd);
-    
+
     if (dd->constraints)
     {
         dd_clear_local_constraint_indices(dd);
     }
 }
 
-static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
+/* This function should be used for moving the domain boundaries during DLB,
+ * for obtaining the minimum cell size. It checks the initially set limit
+ * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
+ * and, possibly, a longer cut-off limit set for PME load balancing.
+ */
+static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
+{
+    real cellsize_min;
+
+    cellsize_min = comm->cellsize_min[dim];
+
+    if (!comm->bVacDLBNoLimit)
+    {
+        /* The cut-off might have changed, e.g. by PME load balancing,
+         * from the value used to set comm->cellsize_min, so check it.
+         */
+        cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
+
+        if (comm->bPMELoadBalDLBLimits)
+        {
+            /* Check for the cut-off limit set by the PME load balancing */
+            cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
+        }
+    }
+
+    return cellsize_min;
+}
+
+static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
                             int dim_ind)
 {
     real grid_jump_limit;
@@ -2692,6 +2766,10 @@ static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
     grid_jump_limit = comm->cellsize_limit;
     if (!comm->bVacDLBNoLimit)
     {
+        if (comm->bPMELoadBalDLBLimits)
+        {
+            cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
+        }
         grid_jump_limit = max(grid_jump_limit,
                               cutoff/comm->cd[dim_ind].np);
     }
@@ -2699,32 +2777,32 @@ static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
     return grid_jump_limit;
 }
 
-static gmx_bool check_grid_jump(gmx_large_int_t step,
-                                gmx_domdec_t *dd,
-                                real cutoff,
-                                gmx_ddbox_t *ddbox,
-                                gmx_bool bFatal)
+static gmx_bool check_grid_jump(gmx_int64_t     step,
+                                gmx_domdec_t   *dd,
+                                real            cutoff,
+                                gmx_ddbox_t    *ddbox,
+                                gmx_bool        bFatal)
 {
     gmx_domdec_comm_t *comm;
-    int  d,dim;
-    real limit,bfac;
-    gmx_bool bInvalid;
+    int                d, dim;
+    real               limit, bfac;
+    gmx_bool           bInvalid;
 
     bInvalid = FALSE;
 
     comm = dd->comm;
-    
-    for(d=1; d<dd->ndim; d++)
+
+    for (d = 1; d < dd->ndim; d++)
     {
-        dim = dd->dim[d];
-        limit = grid_jump_limit(comm,cutoff,d);
-        bfac = ddbox->box_size[dim];
+        dim   = dd->dim[d];
+        limit = grid_jump_limit(comm, cutoff, d);
+        bfac  = ddbox->box_size[dim];
         if (ddbox->tric_dir[dim])
         {
             bfac *= ddbox->skew_fac[dim];
         }
         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
-            (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
+                                                              (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
         {
             bInvalid = TRUE;
 
@@ -2735,9 +2813,9 @@ static gmx_bool check_grid_jump(gmx_large_int_t step,
                 /* This error should never be triggered under normal
                  * circumstances, but you never know ...
                  */
-                gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
-                          gmx_step_str(step,buf),
-                          dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
+                gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
+                          gmx_step_str(step, buf),
+                          dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
             }
         }
     }
@@ -2753,7 +2831,7 @@ static int dd_load_count(gmx_domdec_comm_t *comm)
 static float dd_force_load(gmx_domdec_comm_t *comm)
 {
     float load;
-    
+
     if (comm->eFlop)
     {
         load = comm->flop;
@@ -2761,34 +2839,62 @@ static float dd_force_load(gmx_domdec_comm_t *comm)
         {
             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
         }
-    } 
+    }
     else
     {
         load = comm->cycl[ddCyclF];
         if (comm->cycl_n[ddCyclF] > 1)
         {
             /* Subtract the maximum of the last n cycle counts
-             * to get rid of possible high counts due to other soures,
+             * to get rid of possible high counts due to other sources,
              * for instance system activity, that would otherwise
              * affect the dynamic load balancing.
              */
             load -= comm->cycl_max[ddCyclF];
         }
+
+#ifdef GMX_MPI
+        if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
+        {
+            float gpu_wait, gpu_wait_sum;
+
+            gpu_wait = comm->cycl[ddCyclWaitGPU];
+            if (comm->cycl_n[ddCyclF] > 1)
+            {
+                /* We should remove the WaitGPU time of the same MD step
+                 * as the one with the maximum F time, since the F time
+                 * and the wait time are not independent.
+                 * Furthermore, the step for the max F time should be chosen
+                 * the same on all ranks that share the same GPU.
+                 * But to keep the code simple, we remove the average instead.
+                 * The main reason for artificially long times at some steps
+                 * is spurious CPU activity or MPI time, so we don't expect
+                 * that changes in the GPU wait time matter a lot here.
+                 */
+                gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
+            }
+            /* Sum the wait times over the ranks that share the same GPU */
+            MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
+                          comm->mpi_comm_gpu_shared);
+            /* Replace the wait time by the average over the ranks */
+            load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
+        }
+#endif
     }
-    
+
     return load;
 }
 
-static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
+static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
 {
     gmx_domdec_comm_t *comm;
-    int i;
-    
+    int                i;
+
     comm = dd->comm;
-    
-    snew(*dim_f,dd->nc[dim]+1);
+
+    snew(*dim_f, dd->nc[dim]+1);
     (*dim_f)[0] = 0;
-    for(i=1; i<dd->nc[dim]; i++)
+    for (i = 1; i < dd->nc[dim]; i++)
     {
         if (comm->slb_frac[dim])
         {
@@ -2802,11 +2908,11 @@ static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
     (*dim_f)[dd->nc[dim]] = 1;
 }
 
-static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
+static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
 {
-    int         pmeindex,slab,nso,i;
+    int  pmeindex, slab, nso, i;
     ivec xyz;
-    
+
     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
     {
         ddpme->dim = YY;
@@ -2816,7 +2922,7 @@ static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
         ddpme->dim = dimind;
     }
     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
-    
+
     ddpme->nslab = (ddpme->dim == 0 ?
                     dd->comm->npmenodes_x :
                     dd->comm->npmenodes_y);
@@ -2828,30 +2934,36 @@ static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
 
     nso = dd->comm->npmenodes/ddpme->nslab;
     /* Determine for each PME slab the PP location range for dimension dim */
-    snew(ddpme->pp_min,ddpme->nslab);
-    snew(ddpme->pp_max,ddpme->nslab);
-    for(slab=0; slab<ddpme->nslab; slab++) {
+    snew(ddpme->pp_min, ddpme->nslab);
+    snew(ddpme->pp_max, ddpme->nslab);
+    for (slab = 0; slab < ddpme->nslab; slab++)
+    {
         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
         ddpme->pp_max[slab] = 0;
     }
-    for(i=0; i<dd->nnodes; i++) {
-        ddindex2xyz(dd->nc,i,xyz);
+    for (i = 0; i < dd->nnodes; i++)
+    {
+        ddindex2xyz(dd->nc, i, xyz);
         /* For y only use our y/z slab.
          * This assumes that the PME x grid size matches the DD grid size.
          */
-        if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
-            pmeindex = ddindex2pmeindex(dd,i);
-            if (dimind == 0) {
+        if (dimind == 0 || xyz[XX] == dd->ci[XX])
+        {
+            pmeindex = ddindex2pmeindex(dd, i);
+            if (dimind == 0)
+            {
                 slab = pmeindex/nso;
-            } else {
+            }
+            else
+            {
                 slab = pmeindex % ddpme->nslab;
             }
-            ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
-            ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
+            ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
+            ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
         }
     }
 
-    set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
+    set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
 }
 
 int dd_pme_maxshift_x(gmx_domdec_t *dd)
@@ -2882,19 +2994,19 @@ int dd_pme_maxshift_y(gmx_domdec_t *dd)
     }
 }
 
-static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
-                             gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
+static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
+                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
 {
     gmx_domdec_comm_t *comm;
-    int  nc,ns,s;
-    int  *xmin,*xmax;
-    real range,pme_boundary;
-    int  sh;
-    
+    int                nc, ns, s;
+    int               *xmin, *xmax;
+    real               range, pme_boundary;
+    int                sh;
+
     comm = dd->comm;
-    nc  = dd->nc[ddpme->dim];
-    ns  = ddpme->nslab;
-    
+    nc   = dd->nc[ddpme->dim];
+    ns   = ddpme->nslab;
+
     if (!ddpme->dim_match)
     {
         /* PP decomposition is not along dim: the worst situation */
@@ -2920,9 +3032,9 @@ static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
         /* Avoid extra communication when we are exactly at a boundary */
         range *= 0.999;
-        
+
         sh = 1;
-        for(s=0; s<ns; s++)
+        for (s = 0; s < ns; s++)
         {
             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
             pme_boundary = (real)s/ns;
@@ -2945,66 +3057,78 @@ static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
             }
         }
     }
-    
+
     ddpme->maxshift = sh;
-    
+
     if (debug)
     {
-        fprintf(debug,"PME slab communication range for dim %d is %d\n",
-                ddpme->dim,ddpme->maxshift);
+        fprintf(debug, "PME slab communication range for dim %d is %d\n",
+                ddpme->dim, ddpme->maxshift);
     }
 }
 
-static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
+static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 {
-    int d,dim;
-    
-    for(d=0; d<dd->ndim; d++)
+    int d, dim;
+
+    for (d = 0; d < dd->ndim; d++)
     {
         dim = dd->dim[d];
         if (dim < ddbox->nboundeddim &&
             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
         {
-            gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
-                      dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
-                      dd->nc[dim],dd->comm->cellsize_limit);
+            gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
+                      dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
+                      dd->nc[dim], dd->comm->cellsize_limit);
         }
     }
 }
 
-static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
-                                  gmx_bool bMaster,ivec npulse)
+enum {
+    setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
+};
+
+/* Set the domain boundaries. Use for static (or no) load balancing,
+ * and also for the starting state for dynamic load balancing.
+ * setmode determines if and where the boundaries are stored; use the enum above.
+ * Returns the number of communication pulses in npulse.
+ */
+static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
+                                  int setmode, ivec npulse)
 {
     gmx_domdec_comm_t *comm;
-    int  d,j;
-    rvec cellsize_min;
-    real *cell_x,cell_dx,cellsize;
-    
+    int                d, j;
+    rvec               cellsize_min;
+    real              *cell_x, cell_dx, cellsize;
+
     comm = dd->comm;
-    
-    for(d=0; d<DIM; d++)
+
+    for (d = 0; d < DIM; d++)
     {
         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
-        npulse[d] = 1;
+        npulse[d]       = 1;
         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
         {
             /* Uniform grid */
             cell_dx = ddbox->box_size[d]/dd->nc[d];
-            if (bMaster)
-            {
-                for(j=0; j<dd->nc[d]+1; j++)
-                {
-                    dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
-                }
-            }
-            else
+            switch (setmode)
             {
-                comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
-                comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
+                case setcellsizeslbMASTER:
+                    for (j = 0; j < dd->nc[d]+1; j++)
+                    {
+                        dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
+                    }
+                    break;
+                case setcellsizeslbLOCAL:
+                    comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
+                    comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
+                    break;
+                default:
+                    break;
             }
             cellsize = cell_dx*ddbox->skew_fac[d];
-            while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
+            while (cellsize*npulse[d] < comm->cutoff)
             {
                 npulse[d]++;
             }
@@ -3017,31 +3141,34 @@ static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
              * all cell borders in a loop to obtain identical values
              * to the master distribution case and to determine npulse.
              */
-            if (bMaster)
+            if (setmode == setcellsizeslbMASTER)
             {
                 cell_x = dd->ma->cell_x[d];
             }
             else
             {
-                snew(cell_x,dd->nc[d]+1);
+                snew(cell_x, dd->nc[d]+1);
             }
             cell_x[0] = ddbox->box0[d];
-            for(j=0; j<dd->nc[d]; j++)
+            for (j = 0; j < dd->nc[d]; j++)
             {
-                cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
+                cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
                 cell_x[j+1] = cell_x[j] + cell_dx;
-                cellsize = cell_dx*ddbox->skew_fac[d];
+                cellsize    = cell_dx*ddbox->skew_fac[d];
                 while (cellsize*npulse[d] < comm->cutoff &&
                        npulse[d] < dd->nc[d]-1)
                 {
                     npulse[d]++;
                 }
-                cellsize_min[d] = min(cellsize_min[d],cellsize);
+                cellsize_min[d] = min(cellsize_min[d], cellsize);
             }
-            if (!bMaster)
+            if (setmode == setcellsizeslbLOCAL)
             {
                 comm->cell_x0[d] = cell_x[dd->ci[d]];
                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
+            }
+            if (setmode != setcellsizeslbMASTER)
+            {
                 sfree(cell_x);
             }
         }
@@ -3052,43 +3179,54 @@ static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
         if (d < ddbox->npbcdim &&
             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
         {
-            gmx_fatal_collective(FARGS,NULL,dd,
-                                 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
-                                 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
-                                 comm->cutoff,
-                                 dd->nc[d],dd->nc[d],
-                                 dd->nnodes > dd->nc[d] ? "cells" : "processors");
+            char error_string[STRLEN];
+
+            sprintf(error_string,
+                    "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
+                    dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
+                    comm->cutoff,
+                    dd->nc[d], dd->nc[d],
+                    dd->nnodes > dd->nc[d] ? "cells" : "ranks");
+
+            if (setmode == setcellsizeslbLOCAL)
+            {
+                gmx_fatal_collective(FARGS, NULL, dd, error_string);
+            }
+            else
+            {
+                gmx_fatal(FARGS, error_string);
+            }
         }
     }
-    
+
     if (!comm->bDynLoadBal)
     {
-        copy_rvec(cellsize_min,comm->cellsize_min);
+        copy_rvec(cellsize_min, comm->cellsize_min);
     }
-   
-    for(d=0; d<comm->npmedecompdim; d++)
+
+    for (d = 0; d < comm->npmedecompdim; d++)
     {
-        set_pme_maxshift(dd,&comm->ddpme[d],
-                         comm->slb_frac[dd->dim[d]]==NULL,ddbox,
+        set_pme_maxshift(dd, &comm->ddpme[d],
+                         comm->slb_frac[dd->dim[d]] == NULL, ddbox,
                          comm->ddpme[d].slb_dim_f);
     }
 }
 
 
 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
-                                       int d,int dim,gmx_domdec_root_t *root,
-                                       gmx_ddbox_t *ddbox,
-                                       gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
+                                                  int d, int dim, gmx_domdec_root_t *root,
+                                                  gmx_ddbox_t *ddbox,
+                                                  gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
 {
     gmx_domdec_comm_t *comm;
-    int  ncd,i,j,nmin,nmin_old;
-    gmx_bool bLimLo,bLimHi;
-    real *cell_size;
-    real fac,halfway,cellsize_limit_f_i,region_size;
-    gmx_bool bPBC,bLastHi=FALSE;
-    int nrange[]={range[0],range[1]};
+    int                ncd, i, j, nmin, nmin_old;
+    gmx_bool           bLimLo, bLimHi;
+    real              *cell_size;
+    real               fac, halfway, cellsize_limit_f_i, region_size;
+    gmx_bool           bPBC, bLastHi = FALSE;
+    int                nrange[] = {range[0], range[1]};
 
-    region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
+    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
 
     comm = dd->comm;
 
@@ -3098,9 +3236,9 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 
     cell_size = root->buf_ncd;
 
-    if (debug) 
+    if (debug)
     {
-        fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
+        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
     }
 
     /* First we need to check if the scaling does not make cells
@@ -3109,7 +3247,7 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
      * it needs to be enlarged, which makes all the other cells smaller,
      * which could in turn make another cell smaller than allowed.
      */
-    for(i=range[0]; i<range[1]; i++)
+    for (i = range[0]; i < range[1]; i++)
     {
         root->bCellMin[i] = FALSE;
     }
@@ -3119,7 +3257,7 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
         nmin_old = nmin;
         /* We need the total for normalization */
         fac = 0;
-        for(i=range[0]; i<range[1]; i++)
+        for (i = range[0]; i < range[1]; i++)
         {
             if (root->bCellMin[i] == FALSE)
             {
@@ -3128,7 +3266,7 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
         }
         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
         /* Determine the cell boundaries */
-        for(i=range[0]; i<range[1]; i++)
+        for (i = range[0]; i < range[1]; i++)
         {
             if (root->bCellMin[i] == FALSE)
             {
@@ -3144,7 +3282,7 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                 if (cell_size[i] < cellsize_limit_f_i)
                 {
                     root->bCellMin[i] = TRUE;
-                    cell_size[i] = cellsize_limit_f_i;
+                    cell_size[i]      = cellsize_limit_f_i;
                     nmin++;
                 }
             }
@@ -3152,8 +3290,8 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
         }
     }
     while (nmin > nmin_old);
-    
-    i=range[1]-1;
+
+    i            = range[1]-1;
     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
     /* For this check we should not use DD_CELL_MARGIN,
      * but a slightly smaller factor,
@@ -3162,14 +3300,14 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
     {
         char buf[22];
-        gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
-                  gmx_step_str(step,buf),
-                  dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
-                  ncd,comm->cellsize_min[dim]);
-    }
-    
-    root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
-    
+        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
+                  gmx_step_str(step, buf),
+                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
+                  ncd, comm->cellsize_min[dim]);
+    }
+
+    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
+
     if (!bUniform)
     {
         /* Check if the boundary did not displace more than halfway
@@ -3180,17 +3318,19 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
          * might be affected by a change and if the old state was ok,
          * the cells will at most be shrunk back to their old size.
          */
-        for(i=range[0]+1; i<range[1]; i++)
+        for (i = range[0]+1; i < range[1]; i++)
         {
             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
             if (root->cell_f[i] < halfway)
             {
                 root->cell_f[i] = halfway;
                 /* Check if the change also causes shifts of the next boundaries */
-                for(j=i+1; j<range[1]; j++)
+                for (j = i+1; j < range[1]; j++)
                 {
                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
+                    {
                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
+                    }
                 }
             }
             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
@@ -3198,15 +3338,17 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
             {
                 root->cell_f[i] = halfway;
                 /* Check if the change also causes shifts of the next boundaries */
-                for(j=i-1; j>=range[0]+1; j--)
+                for (j = i-1; j >= range[0]+1; j--)
                 {
                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
+                    {
                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
+                    }
                 }
             }
         }
     }
-    
+
     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
@@ -3216,7 +3358,7 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
         /* Take care of the staggering of the cell boundaries */
         if (bUniform)
         {
-            for(i=range[0]; i<range[1]; i++)
+            for (i = range[0]; i < range[1]; i++)
             {
                 root->cell_f_max0[i] = root->cell_f[i];
                 root->cell_f_min1[i] = root->cell_f[i+1];
@@ -3224,7 +3366,7 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
         }
         else
         {
-            for(i=range[0]+1; i<range[1]; i++)
+            for (i = range[0]+1; i < range[1]; i++)
             {
                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
@@ -3233,12 +3375,12 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                     /* Both limits violated, try the best we can */
                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
-                    nrange[0]=range[0];
-                    nrange[1]=i;
+                    nrange[0]       = range[0];
+                    nrange[1]       = i;
                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 
-                    nrange[0]=i;
-                    nrange[1]=range[1];
+                    nrange[0] = i;
+                    nrange[1] = range[1];
                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
 
                     return;
@@ -3246,33 +3388,33 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                 else if (bLimLo)
                 {
                     /* root->cell_f[i] = root->bound_min[i]; */
-                    nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
-                    bLastHi=FALSE;
+                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with a higher index */
+                    bLastHi   = FALSE;
                 }
                 else if (bLimHi && !bLastHi)
                 {
-                    bLastHi=TRUE;
+                    bLastHi = TRUE;
                     if (nrange[1] < range[1])   /* found a LimLo before */
                     {
                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-                        nrange[0]=nrange[1];
+                        nrange[0] = nrange[1];
                     }
                     root->cell_f[i] = root->bound_max[i];
-                    nrange[1]=i; 
+                    nrange[1]       = i;
                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-                    nrange[0]=i;
-                    nrange[1]=range[1];
+                    nrange[0] = i;
+                    nrange[1] = range[1];
                 }
             }
             if (nrange[1] < range[1])   /* found last a LimLo */
             {
                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-                nrange[0]=nrange[1];
-                nrange[1]=range[1];
+                nrange[0] = nrange[1];
+                nrange[1] = range[1];
                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
-            } 
+            }
             else if (nrange[0] > range[0]) /* found at least one LimHi */
             {
                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
@@ -3283,19 +3425,19 @@ static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
 
 
 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
-                                       int d,int dim,gmx_domdec_root_t *root,
-                                       gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
-                                       gmx_bool bUniform,gmx_large_int_t step)
+                                       int d, int dim, gmx_domdec_root_t *root,
+                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
+                                       gmx_bool bUniform, gmx_int64_t step)
 {
     gmx_domdec_comm_t *comm;
-    int  ncd,d1,i,j,pos;
-    real *cell_size;
-    real load_aver,load_i,imbalance,change,change_max,sc;
-    real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
-    real change_limit;
-    real relax = 0.5;
-    gmx_bool bPBC;
-    int range[] = { 0, 0 };
+    int                ncd, d1, i, j, pos;
+    real              *cell_size;
+    real               load_aver, load_i, imbalance, change, change_max, sc;
+    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
+    real               change_limit;
+    real               relax = 0.5;
+    gmx_bool           bPBC;
+    int                range[] = { 0, 0 };
 
     comm = dd->comm;
 
@@ -3309,28 +3451,29 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
     cell_size = root->buf_ncd;
 
     /* Store the original boundaries */
-    for(i=0; i<ncd+1; i++)
+    for (i = 0; i < ncd+1; i++)
     {
         root->old_cell_f[i] = root->cell_f[i];
     }
-    if (bUniform) {
-        for(i=0; i<ncd; i++)
+    if (bUniform)
+    {
+        for (i = 0; i < ncd; i++)
         {
             cell_size[i] = 1.0/ncd;
         }
     }
     else if (dd_load_count(comm))
     {
-        load_aver = comm->load[d].sum_m/ncd;
+        load_aver  = comm->load[d].sum_m/ncd;
         change_max = 0;
-        for(i=0; i<ncd; i++)
+        for (i = 0; i < ncd; i++)
         {
             /* Determine the relative imbalance of cell i */
-            load_i = comm->load[d].load[i*comm->load[d].nload+2];
-            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
+            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
+            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
             /* Determine the change of the cell size using underrelaxation */
-            change = -relax*imbalance;
-            change_max = max(change_max,max(change,-change));
+            change     = -relax*imbalance;
+            change_max = max(change_max, max(change, -change));
         }
         /* Limit the amount of scaling.
          * We need to use the same rescaling for all cells in one row,
@@ -3341,20 +3484,20 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
         {
             sc *= change_limit/change_max;
         }
-        for(i=0; i<ncd; i++)
+        for (i = 0; i < ncd; i++)
         {
             /* Determine the relative imbalance of cell i */
-            load_i = comm->load[d].load[i*comm->load[d].nload+2];
-            imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
+            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
+            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
             /* Determine the change of the cell size using underrelaxation */
-            change = -sc*imbalance;
+            change       = -sc*imbalance;
             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
         }
     }
-    
-    cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
+
+    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
     cellsize_limit_f *= DD_CELL_MARGIN;
-    dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
+    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
     if (ddbox->tric_dir[dim])
     {
@@ -3368,34 +3511,37 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
     if (d > 0 && !bUniform)
     {
         /* Make sure that the grid is not shifted too much */
-        for(i=1; i<ncd; i++) {
-            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
+        for (i = 1; i < ncd; i++)
+        {
+            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
             {
                 gmx_incons("Inconsistent DD boundary staggering limits!");
             }
             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
-            space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
-            if (space > 0) {
+            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
+            if (space > 0)
+            {
                 root->bound_min[i] += 0.5*space;
             }
             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
-            space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
-            if (space < 0) {
+            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
+            if (space < 0)
+            {
                 root->bound_max[i] += 0.5*space;
             }
             if (debug)
             {
                 fprintf(debug,
                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
-                        d,i,
+                        d, i,
                         root->cell_f_max0[i-1] + dist_min_f,
-                        root->bound_min[i],root->cell_f[i],root->bound_max[i],
+                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
                         root->cell_f_min1[i] - dist_min_f);
             }
         }
     }
-    range[1]=ncd;
-    root->cell_f[0] = 0;
+    range[1]          = ncd;
+    root->cell_f[0]   = 0;
     root->cell_f[ncd] = 1;
     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
 
@@ -3403,12 +3549,12 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
     /* After the checks above, the cells should obey the cut-off
      * restrictions, but it does not hurt to check.
      */
-    for(i=0; i<ncd; i++)
+    for (i = 0; i < ncd; i++)
     {
         if (debug)
         {
-            fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
-                    dim,i,root->cell_f[i],root->cell_f[i+1]);
+            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
+                    dim, i, root->cell_f[i], root->cell_f[i+1]);
         }
 
         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
@@ -3418,44 +3564,44 @@ static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
             char buf[22];
             fprintf(stderr,
                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
-                    gmx_step_str(step,buf),dim2char(dim),i,
+                    gmx_step_str(step, buf), dim2char(dim), i,
                     (root->cell_f[i+1] - root->cell_f[i])
                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
         }
     }
-    
+
     pos = ncd + 1;
     /* Store the cell boundaries of the lower dimensions at the end */
-    for(d1=0; d1<d; d1++)
+    for (d1 = 0; d1 < d; d1++)
     {
         root->cell_f[pos++] = comm->cell_f0[d1];
         root->cell_f[pos++] = comm->cell_f1[d1];
     }
-    
+
     if (d < comm->npmedecompdim)
     {
         /* The master determines the maximum shift for
          * the coordinate communication between separate PME nodes.
          */
-        set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
+        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
     }
     root->cell_f[pos++] = comm->ddpme[0].maxshift;
     if (d >= 1)
     {
         root->cell_f[pos++] = comm->ddpme[1].maxshift;
     }
-}    
+}
 
 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
-                                             gmx_ddbox_t *ddbox,int dimind)
+                                             gmx_ddbox_t *ddbox, int dimind)
 {
     gmx_domdec_comm_t *comm;
-    int dim;
+    int                dim;
 
     comm = dd->comm;
 
     /* Set the cell dimensions */
-    dim = dd->dim[dimind];
+    dim                = dd->dim[dimind];
     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
     if (dim >= ddbox->nboundeddim)
@@ -3466,11 +3612,11 @@ static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
 }
 
 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
-                                         int d,int dim,real *cell_f_row,
+                                         int d, int dim, real *cell_f_row,
                                          gmx_ddbox_t *ddbox)
 {
     gmx_domdec_comm_t *comm;
-    int d1,dim1,pos;
+    int                d1, dim1, pos;
 
     comm = dd->comm;
 
@@ -3478,15 +3624,15 @@ static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
     /* Each node would only need to know two fractions,
      * but it is probably cheaper to broadcast the whole array.
      */
-    MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
-              0,comm->mpi_comm_load[d]);
+    MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
+              0, comm->mpi_comm_load[d]);
 #endif
     /* Copy the fractions for this dimension from the buffer */
     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
     /* The whole array was communicated, so set the buffer position */
     pos = dd->nc[dim] + 1;
-    for(d1=0; d1<=d; d1++)
+    for (d1 = 0; d1 <= d; d1++)
     {
         if (d1 < d)
         {
@@ -3494,7 +3640,7 @@ static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
             comm->cell_f0[d1] = cell_f_row[pos++];
             comm->cell_f1[d1] = cell_f_row[pos++];
         }
-        relative_to_absolute_cell_bounds(dd,ddbox,d1);
+        relative_to_absolute_cell_bounds(dd, ddbox, d1);
     }
     /* Convert the communicated shift from float to int */
     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
@@ -3505,26 +3651,26 @@ static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
 }
 
 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
-                                         gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
-                                         gmx_bool bUniform,gmx_large_int_t step)
+                                         gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
+                                         gmx_bool bUniform, gmx_int64_t step)
 {
     gmx_domdec_comm_t *comm;
-    int d,dim,d1;
-    gmx_bool bRowMember,bRowRoot;
-    real *cell_f_row;
-    
+    int                d, dim, d1;
+    gmx_bool           bRowMember, bRowRoot;
+    real              *cell_f_row;
+
     comm = dd->comm;
 
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
-        dim = dd->dim[d];
+        dim        = dd->dim[d];
         bRowMember = TRUE;
-        bRowRoot = TRUE;
-        for(d1=d; d1<dd->ndim; d1++)
+        bRowRoot   = TRUE;
+        for (d1 = d; d1 < dd->ndim; d1++)
         {
             if (dd->ci[dd->dim[d1]] > 0)
             {
-                if (d1 > d)
+                if (d1 != d)
                 {
                     bRowMember = FALSE;
                 }
@@ -3535,20 +3681,20 @@ static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
         {
             if (bRowRoot)
             {
-                set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
-                                           ddbox,bDynamicBox,bUniform,step);
+                set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
+                                           ddbox, bDynamicBox, bUniform, step);
                 cell_f_row = comm->root[d]->cell_f;
             }
             else
             {
                 cell_f_row = comm->cell_f_row;
             }
-            distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
+            distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
         }
     }
-}    
+}
 
-static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
+static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 {
     int d;
 
@@ -3556,38 +3702,40 @@ static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
      * not be called when the box has changed since the last
      * call to dd_partition_system.
      */
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
-        relative_to_absolute_cell_bounds(dd,ddbox,d); 
+        relative_to_absolute_cell_bounds(dd, ddbox, d);
     }
 }
 
 
 
 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
-                                  gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
-                                  gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
+                                  gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
+                                  gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
                                   gmx_wallcycle_t wcycle)
 {
     gmx_domdec_comm_t *comm;
-    int dim;
+    int                dim;
 
     comm = dd->comm;
-    
+
     if (bDoDLB)
     {
-        wallcycle_start(wcycle,ewcDDCOMMBOUND);
-        set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
-        wallcycle_stop(wcycle,ewcDDCOMMBOUND);
+        wallcycle_start(wcycle, ewcDDCOMMBOUND);
+        set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
+        wallcycle_stop(wcycle, ewcDDCOMMBOUND);
     }
     else if (bDynamicBox)
     {
-        set_dd_cell_sizes_dlb_nochange(dd,ddbox);
+        set_dd_cell_sizes_dlb_nochange(dd, ddbox);
     }
-    
+
     /* Set the dimensions for which no DD is used */
-    for(dim=0; dim<DIM; dim++) {
-        if (dd->nc[dim] == 1) {
+    for (dim = 0; dim < DIM; dim++)
+    {
+        if (dd->nc[dim] == 1)
+        {
             comm->cell_x0[dim] = 0;
             comm->cell_x1[dim] = ddbox->box_size[dim];
             if (dim >= ddbox->nboundeddim)
@@ -3599,12 +3747,12 @@ static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
     }
 }
 
-static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
+static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
 {
-    int d,np,i;
+    int                    d, np, i;
     gmx_domdec_comm_dim_t *cd;
-    
-    for(d=0; d<dd->ndim; d++)
+
+    for (d = 0; d < dd->ndim; d++)
     {
         cd = &dd->comm->cd[d];
         np = npulse[dd->dim[d]];
@@ -3612,15 +3760,15 @@ static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
         {
             if (debug)
             {
-                fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
-                        dim2char(dd->dim[d]),np);
+                fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
+                        dim2char(dd->dim[d]), np);
             }
             if (DDMASTER(dd) && cd->np_nalloc > 0)
             {
-                fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
+                fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
             }
-            srenew(cd->ind,np);
-            for(i=cd->np_nalloc; i<np; i++)
+            srenew(cd->ind, np);
+            for (i = cd->np_nalloc; i < np; i++)
             {
                 cd->ind[i].index  = NULL;
                 cd->ind[i].nalloc = 0;
@@ -3633,87 +3781,87 @@ static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
 
 
 static void set_dd_cell_sizes(gmx_domdec_t *dd,
-                              gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
-                              gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
+                              gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
+                              gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
                               gmx_wallcycle_t wcycle)
 {
     gmx_domdec_comm_t *comm;
-    int  d;
-    ivec npulse;
-    
+    int                d;
+    ivec               npulse;
+
     comm = dd->comm;
 
     /* Copy the old cell boundaries for the cg displacement check */
-    copy_rvec(comm->cell_x0,comm->old_cell_x0);
-    copy_rvec(comm->cell_x1,comm->old_cell_x1);
-    
+    copy_rvec(comm->cell_x0, comm->old_cell_x0);
+    copy_rvec(comm->cell_x1, comm->old_cell_x1);
+
     if (comm->bDynLoadBal)
     {
         if (DDMASTER(dd))
         {
-            check_box_size(dd,ddbox);
+            check_box_size(dd, ddbox);
         }
-        set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
+        set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
     }
     else
     {
-        set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
-        realloc_comm_ind(dd,npulse);
+        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
+        realloc_comm_ind(dd, npulse);
     }
-    
+
     if (debug)
     {
-        for(d=0; d<DIM; d++)
+        for (d = 0; d < DIM; d++)
         {
-            fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
-                    d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
+            fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
+                    d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
         }
     }
 }
 
 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
                                   gmx_ddbox_t *ddbox,
-                                  rvec cell_ns_x0,rvec cell_ns_x1,
-                                  gmx_large_int_t step)
+                                  rvec cell_ns_x0, rvec cell_ns_x1,
+                                  gmx_int64_t step)
 {
     gmx_domdec_comm_t *comm;
-    int dim_ind,dim;
-    
+    int                dim_ind, dim;
+
     comm = dd->comm;
 
-    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
+    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
     {
         dim = dd->dim[dim_ind];
-        
+
         /* Without PBC we don't have restrictions on the outer cells */
-        if (!(dim >= ddbox->npbcdim && 
+        if (!(dim >= ddbox->npbcdim &&
               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
             comm->bDynLoadBal &&
             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
             comm->cellsize_min[dim])
         {
             char buf[22];
-            gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
-                      gmx_step_str(step,buf),dim2char(dim),
+            gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
+                      gmx_step_str(step, buf), dim2char(dim),
                       comm->cell_x1[dim] - comm->cell_x0[dim],
                       ddbox->skew_fac[dim],
                       dd->comm->cellsize_min[dim],
-                      dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
+                      dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
         }
     }
-    
+
     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
     {
         /* Communicate the boundaries and update cell_ns_x0/1 */
-        dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
+        dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
         if (dd->bGridJump && dd->ndim > 1)
         {
-            check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
+            check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
         }
     }
 }
 
-static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
+static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
 {
     if (YY < npbcdim)
     {
@@ -3740,80 +3888,81 @@ static void check_screw_box(matrix box)
     /* Mathematical limitation */
     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
     {
-        gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
+        gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
     }
-    
+
     /* Limitation due to the asymmetry of the eighth shell method */
     if (box[ZZ][YY] != 0)
     {
-        gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
+        gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
     }
 }
 
-static void distribute_cg(FILE *fplog,gmx_large_int_t step,
-                          matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
+static void distribute_cg(FILE *fplog, gmx_int64_t step,
+                          matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
                           gmx_domdec_t *dd)
 {
     gmx_domdec_master_t *ma;
-    int **tmp_ind=NULL,*tmp_nalloc=NULL;
-    int  i,icg,j,k,k0,k1,d,npbcdim;
-    matrix tcm;
-    rvec box_size,cg_cm;
-    ivec ind;
-    real nrcg,inv_ncg,pos_d;
-    atom_id *cgindex;
-    gmx_bool bUnbounded,bScrew;
+    int                **tmp_ind = NULL, *tmp_nalloc = NULL;
+    int                  i, icg, j, k, k0, k1, d, npbcdim;
+    matrix               tcm;
+    rvec                 box_size, cg_cm;
+    ivec                 ind;
+    real                 nrcg, inv_ncg, pos_d;
+    atom_id             *cgindex;
+    gmx_bool             bUnbounded, bScrew;
 
     ma = dd->ma;
-    
+
     if (tmp_ind == NULL)
     {
-        snew(tmp_nalloc,dd->nnodes);
-        snew(tmp_ind,dd->nnodes);
-        for(i=0; i<dd->nnodes; i++)
+        snew(tmp_nalloc, dd->nnodes);
+        snew(tmp_ind, dd->nnodes);
+        for (i = 0; i < dd->nnodes; i++)
         {
             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
-            snew(tmp_ind[i],tmp_nalloc[i]);
+            snew(tmp_ind[i], tmp_nalloc[i]);
         }
     }
-    
+
     /* Clear the count */
-    for(i=0; i<dd->nnodes; i++)
+    for (i = 0; i < dd->nnodes; i++)
     {
         ma->ncg[i] = 0;
         ma->nat[i] = 0;
     }
-    
-    make_tric_corr_matrix(dd->npbcdim,box,tcm);
-    
+
+    make_tric_corr_matrix(dd->npbcdim, box, tcm);
+
     cgindex = cgs->index;
-    
+
     /* Compute the center of geometry for all charge groups */
-    for(icg=0; icg<cgs->nr; icg++)
+    for (icg = 0; icg < cgs->nr; icg++)
     {
         k0      = cgindex[icg];
         k1      = cgindex[icg+1];
         nrcg    = k1 - k0;
         if (nrcg == 1)
         {
-            copy_rvec(pos[k0],cg_cm);
+            copy_rvec(pos[k0], cg_cm);
         }
         else
         {
             inv_ncg = 1.0/nrcg;
-            
+
             clear_rvec(cg_cm);
-            for(k=k0; (k<k1); k++)
+            for (k = k0; (k < k1); k++)
             {
-                rvec_inc(cg_cm,pos[k]);
+                rvec_inc(cg_cm, pos[k]);
             }
-            for(d=0; (d<DIM); d++)
+            for (d = 0; (d < DIM); d++)
             {
                 cg_cm[d] *= inv_ncg;
             }
         }
         /* Put the charge group in the box and determine the cell index */
-        for(d=DIM-1; d>=0; d--) {
+        for (d = DIM-1; d >= 0; d--)
+        {
             pos_d = cg_cm[d];
             if (d < dd->npbcdim)
             {
@@ -3821,23 +3970,23 @@ static void distribute_cg(FILE *fplog,gmx_large_int_t step,
                 if (tric_dir[d] && dd->nc[d] > 1)
                 {
                     /* Use triclinic coordintates for this dimension */
-                    for(j=d+1; j<DIM; j++)
+                    for (j = d+1; j < DIM; j++)
                     {
                         pos_d += cg_cm[j]*tcm[j][d];
                     }
                 }
-                while(pos_d >= box[d][d])
+                while (pos_d >= box[d][d])
                 {
                     pos_d -= box[d][d];
-                    rvec_dec(cg_cm,box[d]);
+                    rvec_dec(cg_cm, box[d]);
                     if (bScrew)
                     {
                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
                     }
-                    for(k=k0; (k<k1); k++)
+                    for (k = k0; (k < k1); k++)
                     {
-                        rvec_dec(pos[k],box[d]);
+                        rvec_dec(pos[k], box[d]);
                         if (bScrew)
                         {
                             pos[k][YY] = box[YY][YY] - pos[k][YY];
@@ -3845,19 +3994,20 @@ static void distribute_cg(FILE *fplog,gmx_large_int_t step,
                         }
                     }
                 }
-                while(pos_d < 0)
+                while (pos_d < 0)
                 {
                     pos_d += box[d][d];
-                    rvec_inc(cg_cm,box[d]);
+                    rvec_inc(cg_cm, box[d]);
                     if (bScrew)
                     {
                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
                     }
-                    for(k=k0; (k<k1); k++)
+                    for (k = k0; (k < k1); k++)
                     {
-                        rvec_inc(pos[k],box[d]);
-                        if (bScrew) {
+                        rvec_inc(pos[k], box[d]);
+                        if (bScrew)
+                        {
                             pos[k][YY] = box[YY][YY] - pos[k][YY];
                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
                         }
@@ -3866,75 +4016,76 @@ static void distribute_cg(FILE *fplog,gmx_large_int_t step,
             }
             /* This could be done more efficiently */
             ind[d] = 0;
-            while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
+            while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
             {
                 ind[d]++;
             }
         }
-        i = dd_index(dd->nc,ind);
+        i = dd_index(dd->nc, ind);
         if (ma->ncg[i] == tmp_nalloc[i])
         {
             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
-            srenew(tmp_ind[i],tmp_nalloc[i]);
+            srenew(tmp_ind[i], tmp_nalloc[i]);
         }
         tmp_ind[i][ma->ncg[i]] = icg;
         ma->ncg[i]++;
         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
     }
-    
+
     k1 = 0;
-    for(i=0; i<dd->nnodes; i++)
+    for (i = 0; i < dd->nnodes; i++)
     {
         ma->index[i] = k1;
-        for(k=0; k<ma->ncg[i]; k++)
+        for (k = 0; k < ma->ncg[i]; k++)
         {
             ma->cg[k1++] = tmp_ind[i][k];
         }
     }
     ma->index[dd->nnodes] = k1;
-    
-    for(i=0; i<dd->nnodes; i++)
+
+    for (i = 0; i < dd->nnodes; i++)
     {
         sfree(tmp_ind[i]);
     }
     sfree(tmp_ind);
     sfree(tmp_nalloc);
-    
+
     if (fplog)
     {
         char buf[22];
-        fprintf(fplog,"Charge group distribution at step %s:",
-                gmx_step_str(step,buf));
-        for(i=0; i<dd->nnodes; i++)
+        fprintf(fplog, "Charge group distribution at step %s:",
+                gmx_step_str(step, buf));
+        for (i = 0; i < dd->nnodes; i++)
         {
-            fprintf(fplog," %d",ma->ncg[i]);
+            fprintf(fplog, " %d", ma->ncg[i]);
         }
-        fprintf(fplog,"\n");
+        fprintf(fplog, "\n");
     }
 }
 
-static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
-                                t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
+static void get_cg_distribution(FILE *fplog, gmx_int64_t step, gmx_domdec_t *dd,
+                                t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
                                 rvec pos[])
 {
-    gmx_domdec_master_t *ma=NULL;
-    ivec npulse;
-    int  i,cg_gl;
-    int  *ibuf,buf2[2] = { 0, 0 };
-    gmx_bool bMaster = DDMASTER(dd);
+    gmx_domdec_master_t *ma = NULL;
+    ivec                 npulse;
+    int                  i, cg_gl;
+    int                 *ibuf, buf2[2] = { 0, 0 };
+    gmx_bool             bMaster = DDMASTER(dd);
+
     if (bMaster)
     {
         ma = dd->ma;
-        
+
         if (dd->bScrewPBC)
         {
             check_screw_box(box);
         }
-    
-        set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
-    
-        distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
-        for(i=0; i<dd->nnodes; i++)
+
+        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
+
+        distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
+        for (i = 0; i < dd->nnodes; i++)
         {
             ma->ibuf[2*i]   = ma->ncg[i];
             ma->ibuf[2*i+1] = ma->nat[i];
@@ -3945,8 +4096,8 @@ static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *d
     {
         ibuf = NULL;
     }
-    dd_scatter(dd,2*sizeof(int),ibuf,buf2);
-    
+    dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
+
     dd->ncg_home = buf2[0];
     dd->nat_home = buf2[1];
     dd->ncg_tot  = dd->ncg_home;
@@ -3954,87 +4105,89 @@ static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *d
     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
     {
         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
-        srenew(dd->index_gl,dd->cg_nalloc);
-        srenew(dd->cgindex,dd->cg_nalloc+1);
+        srenew(dd->index_gl, dd->cg_nalloc);
+        srenew(dd->cgindex, dd->cg_nalloc+1);
     }
     if (bMaster)
     {
-        for(i=0; i<dd->nnodes; i++)
+        for (i = 0; i < dd->nnodes; i++)
         {
-            ma->ibuf[i] = ma->ncg[i]*sizeof(int);
+            ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
         }
     }
-    
+
     dd_scatterv(dd,
                 DDMASTER(dd) ? ma->ibuf : NULL,
                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
                 DDMASTER(dd) ? ma->cg : NULL,
-                dd->ncg_home*sizeof(int),dd->index_gl);
-    
+                dd->ncg_home*sizeof(int), dd->index_gl);
+
     /* Determine the home charge group sizes */
     dd->cgindex[0] = 0;
-    for(i=0; i<dd->ncg_home; i++)
+    for (i = 0; i < dd->ncg_home; i++)
     {
-        cg_gl = dd->index_gl[i];
+        cg_gl            = dd->index_gl[i];
         dd->cgindex[i+1] =
             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
     }
-    
+
     if (debug)
     {
-        fprintf(debug,"Home charge groups:\n");
-        for(i=0; i<dd->ncg_home; i++)
+        fprintf(debug, "Home charge groups:\n");
+        for (i = 0; i < dd->ncg_home; i++)
         {
-            fprintf(debug," %d",dd->index_gl[i]);
-            if (i % 10 == 9) 
-                fprintf(debug,"\n");
+            fprintf(debug, " %d", dd->index_gl[i]);
+            if (i % 10 == 9)
+            {
+                fprintf(debug, "\n");
+            }
         }
-        fprintf(debug,"\n");
+        fprintf(debug, "\n");
     }
 }
 
-static int compact_and_copy_vec_at(int ncg,int *move,
+static int compact_and_copy_vec_at(int ncg, int *move,
                                    int *cgindex,
-                                   int nvec,int vec,
-                                   rvec *src,gmx_domdec_comm_t *comm,
+                                   int nvec, int vec,
+                                   rvec *src, gmx_domdec_comm_t *comm,
                                    gmx_bool bCompact)
 {
-    int m,icg,i,i0,i1,nrcg;
+    int m, icg, i, i0, i1, nrcg;
     int home_pos;
     int pos_vec[DIM*2];
-    
+
     home_pos = 0;
 
-    for(m=0; m<DIM*2; m++)
+    for (m = 0; m < DIM*2; m++)
     {
         pos_vec[m] = 0;
     }
-    
+
     i0 = 0;
-    for(icg=0; icg<ncg; icg++)
+    for (icg = 0; icg < ncg; icg++)
     {
         i1 = cgindex[icg+1];
-        m = move[icg];
+        m  = move[icg];
         if (m == -1)
         {
             if (bCompact)
             {
                 /* Compact the home array in place */
-                for(i=i0; i<i1; i++)
+                for (i = i0; i < i1; i++)
                 {
-                    copy_rvec(src[i],src[home_pos++]);
+                    copy_rvec(src[i], src[home_pos++]);
                 }
             }
         }
         else
         {
             /* Copy to the communication buffer */
-            nrcg = i1 - i0;
+            nrcg        = i1 - i0;
             pos_vec[m] += 1 + vec*nrcg;
-            for(i=i0; i<i1; i++)
+            for (i = i0; i < i1; i++)
             {
-                copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
+                copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
             }
             pos_vec[m] += (nvec - vec - 1)*nrcg;
         }
@@ -4044,44 +4197,44 @@ static int compact_and_copy_vec_at(int ncg,int *move,
         }
         i0 = i1;
     }
-    
+
     return home_pos;
 }
 
-static int compact_and_copy_vec_cg(int ncg,int *move,
+static int compact_and_copy_vec_cg(int ncg, int *move,
                                    int *cgindex,
-                                   int nvec,rvec *src,gmx_domdec_comm_t *comm,
+                                   int nvec, rvec *src, gmx_domdec_comm_t *comm,
                                    gmx_bool bCompact)
 {
-    int m,icg,i0,i1,nrcg;
+    int m, icg, i0, i1, nrcg;
     int home_pos;
     int pos_vec[DIM*2];
-    
+
     home_pos = 0;
-    
-    for(m=0; m<DIM*2; m++)
+
+    for (m = 0; m < DIM*2; m++)
     {
         pos_vec[m] = 0;
     }
-    
+
     i0 = 0;
-    for(icg=0; icg<ncg; icg++)
+    for (icg = 0; icg < ncg; icg++)
     {
         i1 = cgindex[icg+1];
-        m = move[icg];
+        m  = move[icg];
         if (m == -1)
         {
             if (bCompact)
             {
                 /* Compact the home array in place */
-                copy_rvec(src[icg],src[home_pos++]);
+                copy_rvec(src[icg], src[home_pos++]);
             }
         }
         else
         {
             nrcg = i1 - i0;
             /* Copy to the communication buffer */
-            copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
+            copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
             pos_vec[m] += 1 + nrcg*nvec;
         }
         i0 = i1;
@@ -4090,22 +4243,22 @@ static int compact_and_copy_vec_cg(int ncg,int *move,
     {
         home_pos = ncg;
     }
-    
+
     return home_pos;
 }
 
-static int compact_ind(int ncg,int *move,
-                       int *index_gl,int *cgindex,
+static int compact_ind(int ncg, int *move,
+                       int *index_gl, int *cgindex,
                        int *gatindex,
-                       gmx_ga2la_t ga2la,char *bLocalCG,
+                       gmx_ga2la_t ga2la, char *bLocalCG,
                        int *cginfo)
 {
-    int cg,nat,a0,a1,a,a_gl;
+    int cg, nat, a0, a1, a, a_gl;
     int home_pos;
 
     home_pos = 0;
-    nat = 0;
-    for(cg=0; cg<ncg; cg++)
+    nat      = 0;
+    for (cg = 0; cg < ncg; cg++)
     {
         a0 = cgindex[cg];
         a1 = cgindex[cg+1];
@@ -4115,12 +4268,12 @@ static int compact_ind(int ncg,int *move,
              * Anything that can be done here avoids access to global arrays.
              */
             cgindex[home_pos] = nat;
-            for(a=a0; a<a1; a++)
+            for (a = a0; a < a1; a++)
             {
-                a_gl = gatindex[a];
+                a_gl          = gatindex[a];
                 gatindex[nat] = a_gl;
                 /* The cell number stays 0, so we don't need to set it */
-                ga2la_change_la(ga2la,a_gl,nat);
+                ga2la_change_la(ga2la, a_gl, nat);
                 nat++;
             }
             index_gl[home_pos] = index_gl[cg];
@@ -4131,9 +4284,9 @@ static int compact_ind(int ncg,int *move,
         else
         {
             /* Clear the global indices */
-            for(a=a0; a<a1; a++)
+            for (a = a0; a < a1; a++)
             {
-                ga2la_del(ga2la,gatindex[a]);
+                ga2la_del(ga2la, gatindex[a]);
             }
             if (bLocalCG)
             {
@@ -4142,27 +4295,27 @@ static int compact_ind(int ncg,int *move,
         }
     }
     cgindex[home_pos] = nat;
-    
+
     return home_pos;
 }
 
-static void clear_and_mark_ind(int ncg,int *move,
-                               int *index_gl,int *cgindex,int *gatindex,
-                               gmx_ga2la_t ga2la,char *bLocalCG,
+static void clear_and_mark_ind(int ncg, int *move,
+                               int *index_gl, int *cgindex, int *gatindex,
+                               gmx_ga2la_t ga2la, char *bLocalCG,
                                int *cell_index)
 {
-    int cg,a0,a1,a;
-    
-    for(cg=0; cg<ncg; cg++)
+    int cg, a0, a1, a;
+
+    for (cg = 0; cg < ncg; cg++)
     {
         if (move[cg] >= 0)
         {
             a0 = cgindex[cg];
             a1 = cgindex[cg+1];
             /* Clear the global indices */
-            for(a=a0; a<a1; a++)
+            for (a = a0; a < a1; a++)
             {
-                ga2la_del(ga2la,gatindex[a]);
+                ga2la_del(ga2la, gatindex[a]);
             }
             if (bLocalCG)
             {
@@ -4179,160 +4332,166 @@ static void clear_and_mark_ind(int ncg,int *move,
 
 static void print_cg_move(FILE *fplog,
                           gmx_domdec_t *dd,
-                          gmx_large_int_t step,int cg,int dim,int dir,
-                          gmx_bool bHaveLimitdAndCMOld,real limitd,
-                          rvec cm_old,rvec cm_new,real pos_d)
+                          gmx_int64_t step, int cg, int dim, int dir,
+                          gmx_bool bHaveCgcmOld, real limitd,
+                          rvec cm_old, rvec cm_new, real pos_d)
 {
     gmx_domdec_comm_t *comm;
-    char buf[22];
+    char               buf[22];
 
     comm = dd->comm;
 
-    fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
-    if (bHaveLimitdAndCMOld)
+    fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
+    if (limitd > 0)
     {
-        fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
-                ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
+        fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
+                dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
+                ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
     }
     else
     {
-        fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
-                ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
+        /* We don't have a limiting distance available: don't print it */
+        fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
+                dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
+                ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
     }
-    fprintf(fplog,"distance out of cell %f\n",
-            dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
-    if (bHaveLimitdAndCMOld)
+    fprintf(fplog, "distance out of cell %f\n",
+            dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
+    if (bHaveCgcmOld)
     {
-        fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
-                cm_old[XX],cm_old[YY],cm_old[ZZ]);
+        fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
+                cm_old[XX], cm_old[YY], cm_old[ZZ]);
     }
-    fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
-            cm_new[XX],cm_new[YY],cm_new[ZZ]);
-    fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
+    fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
+            cm_new[XX], cm_new[YY], cm_new[ZZ]);
+    fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
             dim2char(dim),
-            comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
-    fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
+            comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
+    fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
             dim2char(dim),
-            comm->cell_x0[dim],comm->cell_x1[dim]);
+            comm->cell_x0[dim], comm->cell_x1[dim]);
 }
 
 static void cg_move_error(FILE *fplog,
                           gmx_domdec_t *dd,
-                          gmx_large_int_t step,int cg,int dim,int dir,
-                          gmx_bool bHaveLimitdAndCMOld,real limitd,
-                          rvec cm_old,rvec cm_new,real pos_d)
+                          gmx_int64_t step, int cg, int dim, int dir,
+                          gmx_bool bHaveCgcmOld, real limitd,
+                          rvec cm_old, rvec cm_new, real pos_d)
 {
     if (fplog)
     {
-        print_cg_move(fplog, dd,step,cg,dim,dir,
-                      bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
+        print_cg_move(fplog, dd, step, cg, dim, dir,
+                      bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
     }
-    print_cg_move(stderr,dd,step,cg,dim,dir,
-                  bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
+    print_cg_move(stderr, dd, step, cg, dim, dir,
+                  bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
     gmx_fatal(FARGS,
-              "A charge group moved too far between two domain decomposition steps\n"
-              "This usually means that your system is not well equilibrated");
+              "%s moved too far between two domain decomposition steps\n"
+              "This usually means that your system is not well equilibrated",
+              dd->comm->bCGs ? "A charge group" : "An atom");
 }
 
-static void rotate_state_atom(t_state *state,int a)
+static void rotate_state_atom(t_state *state, int a)
 {
     int est;
 
-    for(est=0; est<estNR; est++)
+    for (est = 0; est < estNR; est++)
     {
-        if (EST_DISTR(est) && (state->flags & (1<<est))) {
-            switch (est) {
-            case estX:
-                /* Rotate the complete state; for a rectangular box only */
-                state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
-                state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
-                break;
-            case estV:
-                state->v[a][YY] = -state->v[a][YY];
-                state->v[a][ZZ] = -state->v[a][ZZ];
-                break;
-            case estSDX:
-                state->sd_X[a][YY] = -state->sd_X[a][YY];
-                state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
-                break;
-            case estCGP:
-                state->cg_p[a][YY] = -state->cg_p[a][YY];
-                state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
-                break;
-            case estDISRE_INITF:
-            case estDISRE_RM3TAV:
-            case estORIRE_INITF:
-            case estORIRE_DTAV:
-                /* These are distances, so not affected by rotation */
-                break;
-            default:
-                gmx_incons("Unknown state entry encountered in rotate_state_atom");            
+        if (EST_DISTR(est) && (state->flags & (1<<est)))
+        {
+            switch (est)
+            {
+                case estX:
+                    /* Rotate the complete state; for a rectangular box only */
+                    state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
+                    state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
+                    break;
+                case estV:
+                    state->v[a][YY] = -state->v[a][YY];
+                    state->v[a][ZZ] = -state->v[a][ZZ];
+                    break;
+                case estSDX:
+                    state->sd_X[a][YY] = -state->sd_X[a][YY];
+                    state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
+                    break;
+                case estCGP:
+                    state->cg_p[a][YY] = -state->cg_p[a][YY];
+                    state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
+                    break;
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* These are distances, so not affected by rotation */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in rotate_state_atom");
             }
         }
     }
 }
 
-static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
+static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
 {
     if (natoms > comm->moved_nalloc)
     {
         /* Contents should be preserved here */
         comm->moved_nalloc = over_alloc_dd(natoms);
-        srenew(comm->moved,comm->moved_nalloc);
+        srenew(comm->moved, comm->moved_nalloc);
     }
 
     return comm->moved;
 }
 
-static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
+static void calc_cg_move(FILE *fplog, gmx_int64_t step,
                          gmx_domdec_t *dd,
                          t_state *state,
-                         ivec tric_dir,matrix tcm,
-                         rvec cell_x0,rvec cell_x1,
-                         rvec limitd,rvec limit0,rvec limit1,
+                         ivec tric_dir, matrix tcm,
+                         rvec cell_x0, rvec cell_x1,
+                         rvec limitd, rvec limit0, rvec limit1,
                          const int *cgindex,
-                         int cg_start,int cg_end,
+                         int cg_start, int cg_end,
                          rvec *cg_cm,
                          int *move)
 {
-    int  npbcdim;
-    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
-    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
-    int  flag;
+    int      npbcdim;
+    int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
+    int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+    int      flag;
     gmx_bool bScrew;
-    ivec dev;
-    real inv_ncg,pos_d;
-    rvec cm_new;
+    ivec     dev;
+    real     inv_ncg, pos_d;
+    rvec     cm_new;
 
     npbcdim = dd->npbcdim;
 
-    for(cg=cg_start; cg<cg_end; cg++)
+    for (cg = cg_start; cg < cg_end; cg++)
     {
         k0   = cgindex[cg];
         k1   = cgindex[cg+1];
         nrcg = k1 - k0;
         if (nrcg == 1)
         {
-            copy_rvec(state->x[k0],cm_new);
+            copy_rvec(state->x[k0], cm_new);
         }
         else
         {
             inv_ncg = 1.0/nrcg;
-            
+
             clear_rvec(cm_new);
-            for(k=k0; (k<k1); k++)
+            for (k = k0; (k < k1); k++)
             {
-                rvec_inc(cm_new,state->x[k]);
+                rvec_inc(cm_new, state->x[k]);
             }
-            for(d=0; (d<DIM); d++)
+            for (d = 0; (d < DIM); d++)
             {
                 cm_new[d] = inv_ncg*cm_new[d];
             }
         }
-        
+
         clear_ivec(dev);
         /* Do pbc and check DD cell boundary crossings */
-        for(d=DIM-1; d>=0; d--)
+        for (d = DIM-1; d >= 0; d--)
         {
             if (dd->nc[d] > 1)
             {
@@ -4341,7 +4500,7 @@ static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
                 pos_d = cm_new[d];
                 if (tric_dir[d])
                 {
-                    for(d2=d+1; d2<DIM; d2++)
+                    for (d2 = d+1; d2 < DIM; d2++)
                     {
                         pos_d += cm_new[d2]*tcm[d2][d];
                     }
@@ -4351,24 +4510,25 @@ static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
                 {
                     if (pos_d >= limit1[d])
                     {
-                        cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
-                                      cg_cm[cg],cm_new,pos_d);
+                        cg_move_error(fplog, dd, step, cg, d, 1,
+                                      cg_cm != state->x, limitd[d],
+                                      cg_cm[cg], cm_new, pos_d);
                     }
                     dev[d] = 1;
                     if (dd->ci[d] == dd->nc[d] - 1)
                     {
-                        rvec_dec(cm_new,state->box[d]);
+                        rvec_dec(cm_new, state->box[d]);
                         if (bScrew)
                         {
                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
                         }
-                        for(k=k0; (k<k1); k++)
+                        for (k = k0; (k < k1); k++)
                         {
-                            rvec_dec(state->x[k],state->box[d]);
+                            rvec_dec(state->x[k], state->box[d]);
                             if (bScrew)
                             {
-                                rotate_state_atom(state,k);
+                                rotate_state_atom(state, k);
                             }
                         }
                     }
@@ -4377,24 +4537,25 @@ static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
                 {
                     if (pos_d < limit0[d])
                     {
-                        cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
-                                      cg_cm[cg],cm_new,pos_d);
+                        cg_move_error(fplog, dd, step, cg, d, -1,
+                                      cg_cm != state->x, limitd[d],
+                                      cg_cm[cg], cm_new, pos_d);
                     }
                     dev[d] = -1;
                     if (dd->ci[d] == 0)
                     {
-                        rvec_inc(cm_new,state->box[d]);
+                        rvec_inc(cm_new, state->box[d]);
                         if (bScrew)
                         {
                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
                         }
-                        for(k=k0; (k<k1); k++)
+                        for (k = k0; (k < k1); k++)
                         {
-                            rvec_inc(state->x[k],state->box[d]);
+                            rvec_inc(state->x[k], state->box[d]);
                             if (bScrew)
                             {
-                                rotate_state_atom(state,k);
+                                rotate_state_atom(state, k);
                             }
                         }
                     }
@@ -4405,29 +4566,29 @@ static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
                 /* Put the charge group in the rectangular unit-cell */
                 while (cm_new[d] >= state->box[d][d])
                 {
-                    rvec_dec(cm_new,state->box[d]);
-                    for(k=k0; (k<k1); k++)
+                    rvec_dec(cm_new, state->box[d]);
+                    for (k = k0; (k < k1); k++)
                     {
-                        rvec_dec(state->x[k],state->box[d]);
+                        rvec_dec(state->x[k], state->box[d]);
                     }
                 }
                 while (cm_new[d] < 0)
                 {
-                    rvec_inc(cm_new,state->box[d]);
-                    for(k=k0; (k<k1); k++)
+                    rvec_inc(cm_new, state->box[d]);
+                    for (k = k0; (k < k1); k++)
                     {
-                        rvec_inc(state->x[k],state->box[d]);
+                        rvec_inc(state->x[k], state->box[d]);
                     }
                 }
             }
         }
-    
-        copy_rvec(cm_new,cg_cm[cg]);
-        
+
+        copy_rvec(cm_new, cg_cm[cg]);
+
         /* Determine where this cg should go */
         flag = 0;
-        mc = -1;
-        for(d=0; d<dd->ndim; d++)
+        mc   = -1;
+        for (d = 0; d < dd->ndim; d++)
         {
             dim = dd->dim[d];
             if (dev[dim] == 1)
@@ -4441,7 +4602,8 @@ static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
             else if (dev[dim] == -1)
             {
                 flag |= DD_FLAG_BW(d);
-                if (mc == -1) {
+                if (mc == -1)
+                {
                     if (dd->nc[dim] > 2)
                     {
                         mc = d*2 + 1;
@@ -4458,79 +4620,79 @@ static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
     }
 }
 
-static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
-                               gmx_domdec_t *dd,ivec tric_dir,
-                               t_state *state,rvec **f,
-                               t_forcerec *fr,t_mdatoms *md,
+static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
+                               gmx_domdec_t *dd, ivec tric_dir,
+                               t_state *state, rvec **f,
+                               t_forcerec *fr,
                                gmx_bool bCompact,
                                t_nrnb *nrnb,
                                int *ncg_stay_home,
                                int *ncg_moved)
 {
-    int  *move;
-    int  npbcdim;
-    int  ncg[DIM*2],nat[DIM*2];
-    int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
-    int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
-    int  sbuf[2],rbuf[2];
-    int  home_pos_cg,home_pos_at,buf_pos;
-    int  flag;
-    gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
-    gmx_bool bScrew;
-    ivec dev;
-    real inv_ncg,pos_d;
-    matrix tcm;
-    rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
-    atom_id *cgindex;
-    cginfo_mb_t *cginfo_mb;
+    int               *move;
+    int                npbcdim;
+    int                ncg[DIM*2], nat[DIM*2];
+    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
+    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
+    int                sbuf[2], rbuf[2];
+    int                home_pos_cg, home_pos_at, buf_pos;
+    int                flag;
+    gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
+    gmx_bool           bScrew;
+    ivec               dev;
+    real               inv_ncg, pos_d;
+    matrix             tcm;
+    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
+    atom_id           *cgindex;
+    cginfo_mb_t       *cginfo_mb;
     gmx_domdec_comm_t *comm;
-    int  *moved;
-    int  nthread,thread;
-    
+    int               *moved;
+    int                nthread, thread;
+
     if (dd->bScrewPBC)
     {
         check_screw_box(state->box);
     }
-    
+
     comm  = dd->comm;
     if (fr->cutoff_scheme == ecutsGROUP)
     {
         cg_cm = fr->cg_cm;
     }
-    
-    for(i=0; i<estNR; i++)
+
+    for (i = 0; i < estNR; i++)
     {
         if (EST_DISTR(i))
         {
             switch (i)
             {
-            case estX:   /* Always present */            break;
-            case estV:   bV   = (state->flags & (1<<i)); break;
-            case estSDX: bSDX = (state->flags & (1<<i)); break;
-            case estCGP: bCGP = (state->flags & (1<<i)); break;
-            case estLD_RNG:
-            case estLD_RNGI:
-            case estDISRE_INITF:
-            case estDISRE_RM3TAV:
-            case estORIRE_INITF:
-            case estORIRE_DTAV:
-                /* No processing required */
-                break;
-            default:
-            gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
+                case estX: /* Always present */ break;
+                case estV:   bV   = (state->flags & (1<<i)); break;
+                case estSDX: bSDX = (state->flags & (1<<i)); break;
+                case estCGP: bCGP = (state->flags & (1<<i)); break;
+                case estLD_RNG:
+                case estLD_RNGI:
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* No processing required */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
             }
         }
     }
-    
+
     if (dd->ncg_tot > comm->nalloc_int)
     {
         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
-        srenew(comm->buf_int,comm->nalloc_int);
+        srenew(comm->buf_int, comm->nalloc_int);
     }
     move = comm->buf_int;
-    
+
     /* Clear the count */
-    for(c=0; c<dd->ndim*2; c++)
+    for (c = 0; c < dd->ndim*2; c++)
     {
         ncg[c] = 0;
         nat[c] = 0;
@@ -4538,7 +4700,7 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
 
     npbcdim = dd->npbcdim;
 
-    for(d=0; (d<DIM); d++)
+    for (d = 0; (d < DIM); d++)
     {
         limitd[d] = dd->comm->cellsize_min[d];
         if (d >= npbcdim && dd->ci[d] == 0)
@@ -4571,9 +4733,9 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
             limit1[d] =  GMX_FLOAT_MAX;
         }
     }
-    
-    make_tric_corr_matrix(npbcdim,state->box,tcm);
-    
+
+    make_tric_corr_matrix(npbcdim, state->box, tcm);
+
     cgindex = dd->cgindex;
 
     nthread = gmx_omp_nthreads_get(emntDomdec);
@@ -4582,22 +4744,22 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
      * and put them in the box and determine where they should go.
      */
 #pragma omp parallel for num_threads(nthread) schedule(static)
-    for(thread=0; thread<nthread; thread++)
+    for (thread = 0; thread < nthread; thread++)
     {
-        calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
-                     cell_x0,cell_x1,limitd,limit0,limit1,
+        calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
+                     cell_x0, cell_x1, limitd, limit0, limit1,
                      cgindex,
                      ( thread   *dd->ncg_home)/nthread,
                      ((thread+1)*dd->ncg_home)/nthread,
-                     fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
+                     fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
                      move);
     }
 
-    for(cg=0; cg<dd->ncg_home; cg++)
+    for (cg = 0; cg < dd->ncg_home; cg++)
     {
         if (move[cg] >= 0)
         {
-            mc = move[cg];
+            mc       = move[cg];
             flag     = mc & ~DD_FLAG_NRCG;
             mc       = mc & DD_FLAG_NRCG;
             move[cg] = mc;
@@ -4605,7 +4767,7 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
             {
                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
-                srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
+                srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
             }
             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
             /* We store the cg size in the lower 16 bits
@@ -4618,16 +4780,16 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
             nat[mc] += nrcg;
         }
     }
-    
-    inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
-    inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
+
+    inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
+    inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
 
     *ncg_moved = 0;
-    for(i=0; i<dd->ndim*2; i++)
+    for (i = 0; i < dd->ndim*2; i++)
     {
         *ncg_moved += ncg[i];
     }
-    
+
     nvec = 1;
     if (bV)
     {
@@ -4641,80 +4803,80 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
     {
         nvec++;
     }
-    
+
     /* Make sure the communication buffers are large enough */
-    for(mc=0; mc<dd->ndim*2; mc++)
+    for (mc = 0; mc < dd->ndim*2; mc++)
     {
         nvr = ncg[mc] + nat[mc]*nvec;
         if (nvr > comm->cgcm_state_nalloc[mc])
         {
             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
-            srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
+            srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
         }
     }
-    
+
     switch (fr->cutoff_scheme)
     {
-    case ecutsGROUP:
-        /* Recalculating cg_cm might be cheaper than communicating,
-         * but that could give rise to rounding issues.
-         */
-        home_pos_cg =
-            compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
-                                    nvec,cg_cm,comm,bCompact);
-    break;
-    case ecutsVERLET:
-        /* Without charge groups we send the moved atom coordinates
-         * over twice. This is so the code below can be used without
-         * many conditionals for both for with and without charge groups.
-         */
-        home_pos_cg =
-            compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
-                                    nvec,state->x,comm,FALSE);
-        if (bCompact)
-        {
-            home_pos_cg -= *ncg_moved;
-        }
-        break;
-    default:
-        gmx_incons("unimplemented");
-        home_pos_cg = 0;
+        case ecutsGROUP:
+            /* Recalculating cg_cm might be cheaper than communicating,
+             * but that could give rise to rounding issues.
+             */
+            home_pos_cg =
+                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
+                                        nvec, cg_cm, comm, bCompact);
+            break;
+        case ecutsVERLET:
+            /* Without charge groups we send the moved atom coordinates
+             * over twice. This is so the code below can be used without
+             * many conditionals for both with and without charge groups.
+             */
+            home_pos_cg =
+                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
+                                        nvec, state->x, comm, FALSE);
+            if (bCompact)
+            {
+                home_pos_cg -= *ncg_moved;
+            }
+            break;
+        default:
+            gmx_incons("unimplemented");
+            home_pos_cg = 0;
     }
-    
-    vec = 0;
+
+    vec         = 0;
     home_pos_at =
-        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
-                                nvec,vec++,state->x,comm,bCompact);
+        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
+                                nvec, vec++, state->x, comm, bCompact);
     if (bV)
     {
-        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
-                                nvec,vec++,state->v,comm,bCompact);
+        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
+                                nvec, vec++, state->v, comm, bCompact);
     }
     if (bSDX)
     {
-        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
-                                nvec,vec++,state->sd_X,comm,bCompact);
+        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
+                                nvec, vec++, state->sd_X, comm, bCompact);
     }
     if (bCGP)
     {
-        compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
-                                nvec,vec++,state->cg_p,comm,bCompact);
+        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
+                                nvec, vec++, state->cg_p, comm, bCompact);
     }
-    
+
     if (bCompact)
     {
-        compact_ind(dd->ncg_home,move,
-                    dd->index_gl,dd->cgindex,dd->gatindex,
-                    dd->ga2la,comm->bLocalCG,
+        compact_ind(dd->ncg_home, move,
+                    dd->index_gl, dd->cgindex, dd->gatindex,
+                    dd->ga2la, comm->bLocalCG,
                     fr->cginfo);
     }
     else
     {
         if (fr->cutoff_scheme == ecutsVERLET)
         {
-            moved = get_moved(comm,dd->ncg_home);
+            moved = get_moved(comm, dd->ncg_home);
 
-            for(k=0; k<dd->ncg_home; k++)
+            for (k = 0; k < dd->ncg_home; k++)
             {
                 moved[k] = 0;
             }
@@ -4724,22 +4886,22 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
             moved = fr->ns.grid->cell_index;
         }
 
-        clear_and_mark_ind(dd->ncg_home,move,
-                           dd->index_gl,dd->cgindex,dd->gatindex,
-                           dd->ga2la,comm->bLocalCG,
+        clear_and_mark_ind(dd->ncg_home, move,
+                           dd->index_gl, dd->cgindex, dd->gatindex,
+                           dd->ga2la, comm->bLocalCG,
                            moved);
     }
-    
+
     cginfo_mb = fr->cginfo_mb;
 
     *ncg_stay_home = home_pos_cg;
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
-        dim = dd->dim[d];
+        dim      = dd->dim[d];
         ncg_recv = 0;
         nat_recv = 0;
         nvr      = 0;
-        for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
+        for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
         {
             cdd = d*2 + dir;
             /* Communicate the cg and atom counts */
@@ -4747,26 +4909,26 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
             sbuf[1] = nat[cdd];
             if (debug)
             {
-                fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
-                        d,dir,sbuf[0],sbuf[1]);
+                fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
+                        d, dir, sbuf[0], sbuf[1]);
             }
             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
-            
+
             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
             {
                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
-                srenew(comm->buf_int,comm->nalloc_int);
+                srenew(comm->buf_int, comm->nalloc_int);
             }
-            
+
             /* Communicate the charge group indices, sizes and flags */
             dd_sendrecv_int(dd, d, dir,
                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
-            
+
             nvs = ncg[cdd] + nat[cdd]*nvec;
             i   = rbuf[0]  + rbuf[1] *nvec;
-            vec_rvec_check_alloc(&comm->vbuf,nvr+i);
-            
+            vec_rvec_check_alloc(&comm->vbuf, nvr+i);
+
             /* Communicate cgcm and state */
             dd_sendrecv_rvec(dd, d, dir,
                              comm->cgcm_state[cdd], nvs,
@@ -4775,10 +4937,10 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
             nat_recv += rbuf[1];
             nvr      += i;
         }
-        
+
         /* Process the received charge groups */
         buf_pos = 0;
-        for(cg=0; cg<ncg_recv; cg++)
+        for (cg = 0; cg < ncg_recv; cg++)
         {
             flag = comm->buf_int[cg*DD_CGIBS+1];
 
@@ -4792,12 +4954,12 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
                     ((flag & DD_FLAG_BW(d)) &&
                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
                 {
-                    cg_move_error(fplog,dd,step,cg,dim,
+                    cg_move_error(fplog, dd, step, cg, dim,
                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
-                                   FALSE,0,
-                                   comm->vbuf.v[buf_pos],
-                                   comm->vbuf.v[buf_pos],
-                                   comm->vbuf.v[buf_pos][dim]);
+                                  fr->cutoff_scheme == ecutsGROUP, 0,
+                                  comm->vbuf.v[buf_pos],
+                                  comm->vbuf.v[buf_pos],
+                                  comm->vbuf.v[buf_pos][dim]);
                 }
             }
 
@@ -4805,7 +4967,7 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
             if (d < dd->ndim-1)
             {
                 /* Check which direction this cg should go */
-                for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
+                for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
                 {
                     if (dd->bGridJump)
                     {
@@ -4832,7 +4994,7 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
                             pos_d = comm->vbuf.v[buf_pos][dim2];
                             if (tric_dir[dim2])
                             {
-                                for(d3=dim2+1; d3<DIM; d3++)
+                                for (d3 = dim2+1; d3 < DIM; d3++)
                                 {
                                     pos_d +=
                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
@@ -4874,25 +5036,25 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
                     }
                 }
             }
-            
+
             nrcg = flag & DD_FLAG_NRCG;
             if (mc == -1)
             {
                 if (home_pos_cg+1 > dd->cg_nalloc)
                 {
                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
-                    srenew(dd->index_gl,dd->cg_nalloc);
-                    srenew(dd->cgindex,dd->cg_nalloc+1);
+                    srenew(dd->index_gl, dd->cg_nalloc);
+                    srenew(dd->cgindex, dd->cg_nalloc+1);
                 }
                 /* Set the global charge group index and size */
-                dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
+                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
                 /* Copy the state from the buffer */
-                dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
+                dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
                 if (fr->cutoff_scheme == ecutsGROUP)
                 {
                     cg_cm = fr->cg_cm;
-                    copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
+                    copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
                 }
                 buf_pos++;
 
@@ -4906,16 +5068,16 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
 
                 if (home_pos_at+nrcg > state->nalloc)
                 {
-                    dd_realloc_state(state,f,home_pos_at+nrcg);
+                    dd_realloc_state(state, f, home_pos_at+nrcg);
                 }
-                for(i=0; i<nrcg; i++)
+                for (i = 0; i < nrcg; i++)
                 {
                     copy_rvec(comm->vbuf.v[buf_pos++],
                               state->x[home_pos_at+i]);
                 }
                 if (bV)
                 {
-                    for(i=0; i<nrcg; i++)
+                    for (i = 0; i < nrcg; i++)
                     {
                         copy_rvec(comm->vbuf.v[buf_pos++],
                                   state->v[home_pos_at+i]);
@@ -4923,7 +5085,7 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
                 }
                 if (bSDX)
                 {
-                    for(i=0; i<nrcg; i++)
+                    for (i = 0; i < nrcg; i++)
                     {
                         copy_rvec(comm->vbuf.v[buf_pos++],
                                   state->sd_X[home_pos_at+i]);
@@ -4931,7 +5093,7 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
                 }
                 if (bCGP)
                 {
-                    for(i=0; i<nrcg; i++)
+                    for (i = 0; i < nrcg; i++)
                     {
                         copy_rvec(comm->vbuf.v[buf_pos++],
                                   state->cg_p[home_pos_at+i]);
@@ -4946,13 +5108,13 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
                 {
                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
-                    srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
+                    srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
                 }
                 nvr = ncg[mc] + nat[mc]*nvec;
                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
                 {
                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
-                    srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
+                    srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
                 }
                 /* Copy from the receive to the send buffers */
                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
@@ -4967,16 +5129,16 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
             }
         }
     }
-    
+
     /* With sorting (!bCompact) the indices are now only partially up to date
      * and ncg_home and nat_home are not the real count, since there are
      * "holes" in the arrays for the charge groups that moved to neighbors.
      */
     if (fr->cutoff_scheme == ecutsVERLET)
     {
-        moved = get_moved(comm,home_pos_cg);
+        moved = get_moved(comm, home_pos_cg);
 
-        for(i=dd->ncg_home; i<home_pos_cg; i++)
+        for (i = dd->ncg_home; i < home_pos_cg; i++)
         {
             moved[i] = 0;
         }
@@ -4988,12 +5150,12 @@ static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
     {
         fprintf(debug,
                 "Finished repartitioning: cgs moved out %d, new home %d\n",
-                *ncg_moved,dd->ncg_home-*ncg_moved);
-                
+                *ncg_moved, dd->ncg_home-*ncg_moved);
+
     }
 }
 
-void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
+void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
 {
     dd->comm->cycl[ddCycl] += cycles;
     dd->comm->cycl_n[ddCycl]++;
@@ -5005,18 +5167,18 @@ void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
 
 static double force_flop_count(t_nrnb *nrnb)
 {
-    int i;
-    double sum;
+    int         i;
+    double      sum;
     const char *name;
 
     sum = 0;
-    for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
+    for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
     {
         /* To get closer to the real timings, we half the count
          * for the normal loops and again half it for water loops.
          */
         name = nrnb_str(i);
-        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
+        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
         {
             sum += nrnb->n[i]*0.25*cost_nrnb(i);
         }
@@ -5025,13 +5187,15 @@ static double force_flop_count(t_nrnb *nrnb)
             sum += nrnb->n[i]*0.50*cost_nrnb(i);
         }
     }
-    for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
+    for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
     {
         name = nrnb_str(i);
-        if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
-        sum += nrnb->n[i]*cost_nrnb(i);
+        if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
+        {
+            sum += nrnb->n[i]*cost_nrnb(i);
+        }
     }
-    for(i=eNR_BONDS; i<=eNR_WALLS; i++)
+    for (i = eNR_BONDS; i <= eNR_WALLS; i++)
     {
         sum += nrnb->n[i]*cost_nrnb(i);
     }
@@ -5039,62 +5203,62 @@ static double force_flop_count(t_nrnb *nrnb)
     return sum;
 }
 
-void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
+void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
 {
     if (dd->comm->eFlop)
     {
         dd->comm->flop -= force_flop_count(nrnb);
     }
 }
-void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
+void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
 {
     if (dd->comm->eFlop)
     {
         dd->comm->flop += force_flop_count(nrnb);
         dd->comm->flop_n++;
     }
-}  
+}
 
 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
 {
     int i;
-    
-    for(i=0; i<ddCyclNr; i++)
+
+    for (i = 0; i < ddCyclNr; i++)
     {
-        dd->comm->cycl[i] = 0;
-        dd->comm->cycl_n[i] = 0;
+        dd->comm->cycl[i]     = 0;
+        dd->comm->cycl_n[i]   = 0;
         dd->comm->cycl_max[i] = 0;
     }
-    dd->comm->flop = 0;
+    dd->comm->flop   = 0;
     dd->comm->flop_n = 0;
 }
 
-static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
+static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
 {
     gmx_domdec_comm_t *comm;
     gmx_domdec_load_t *load;
-    gmx_domdec_root_t *root=NULL;
-    int  d,dim,cid,i,pos;
-    float cell_frac=0,sbuf[DD_NLOAD_MAX];
-    gmx_bool bSepPME;
-    
+    gmx_domdec_root_t *root = NULL;
+    int                d, dim, cid, i, pos;
+    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
+    gmx_bool           bSepPME;
+
     if (debug)
     {
-        fprintf(debug,"get_load_distribution start\n");
+        fprintf(debug, "get_load_distribution start\n");
     }
 
-    wallcycle_start(wcycle,ewcDDCOMMLOAD);
-    
+    wallcycle_start(wcycle, ewcDDCOMMLOAD);
+
     comm = dd->comm;
-    
+
     bSepPME = (dd->pme_nodeid >= 0);
-    
-    for(d=dd->ndim-1; d>=0; d--)
+
+    for (d = dd->ndim-1; d >= 0; d--)
     {
         dim = dd->dim[d];
         /* Check if we participate in the communication in this dimension */
-        if (d == dd->ndim-1 || 
-            (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
+        if (d == dd->ndim-1 ||
+            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
         {
             load = &comm->load[d];
             if (dd->bGridJump)
@@ -5148,9 +5312,9 @@ static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
              * The communicators are setup such that the root always has rank 0.
              */
 #ifdef GMX_MPI
-            MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
-                       load->load,load->nload*sizeof(float),MPI_BYTE,
-                       0,comm->mpi_comm_load[d]);
+            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
+                       load->load, load->nload*sizeof(float), MPI_BYTE,
+                       0, comm->mpi_comm_load[d]);
 #endif
             if (dd->ci[dim] == dd->master_ci[dim])
             {
@@ -5159,18 +5323,18 @@ static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
                 {
                     root = comm->root[d];
                 }
-                load->sum = 0;
-                load->max = 0;
-                load->sum_m = 0;
+                load->sum      = 0;
+                load->max      = 0;
+                load->sum_m    = 0;
                 load->cvol_min = 1;
-                load->flags = 0;
-                load->mdf = 0;
-                load->pme = 0;
-                pos = 0;
-                for(i=0; i<dd->nc[dim]; i++)
+                load->flags    = 0;
+                load->mdf      = 0;
+                load->pme      = 0;
+                pos            = 0;
+                for (i = 0; i < dd->nc[dim]; i++)
                 {
                     load->sum += load->load[pos++];
-                    load->max = max(load->max,load->load[pos]);
+                    load->max  = max(load->max, load->load[pos]);
                     pos++;
                     if (dd->bGridJump)
                     {
@@ -5179,14 +5343,14 @@ static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
                             /* This direction could not be load balanced properly,
                              * therefore we need to use the maximum iso the average load.
                              */
-                            load->sum_m = max(load->sum_m,load->load[pos]);
+                            load->sum_m = max(load->sum_m, load->load[pos]);
                         }
                         else
                         {
                             load->sum_m += load->load[pos];
                         }
                         pos++;
-                        load->cvol_min = min(load->cvol_min,load->load[pos]);
+                        load->cvol_min = min(load->cvol_min, load->load[pos]);
                         pos++;
                         if (d < dd->ndim-1)
                         {
@@ -5200,9 +5364,9 @@ static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
                     }
                     if (bSepPME)
                     {
-                        load->mdf = max(load->mdf,load->load[pos]);
+                        load->mdf = max(load->mdf, load->load[pos]);
                         pos++;
-                        load->pme = max(load->pme,load->load[pos]);
+                        load->pme = max(load->pme, load->load[pos]);
                         pos++;
                     }
                 }
@@ -5223,7 +5387,7 @@ static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
         comm->load_max   += comm->load[0].max;
         if (comm->bDynLoadBal)
         {
-            for(d=0; d<dd->ndim; d++)
+            for (d = 0; d < dd->ndim; d++)
             {
                 if (comm->load[0].flags & (1<<d))
                 {
@@ -5238,11 +5402,11 @@ static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
         }
     }
 
-    wallcycle_stop(wcycle,ewcDDCOMMLOAD);
-    
+    wallcycle_stop(wcycle, ewcDDCOMMLOAD);
+
     if (debug)
     {
-        fprintf(debug,"get_load_distribution finished\n");
+        fprintf(debug, "get_load_distribution finished\n");
     }
 }
 
@@ -5263,12 +5427,12 @@ static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
     }
 }
 
-static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
+static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
 {
-    char  buf[STRLEN];
-    int   npp,npme,nnodes,d,limp;
-    float imbal,pme_f_ratio,lossf,lossp=0;
-    gmx_bool  bLim;
+    char               buf[STRLEN];
+    int                npp, npme, nnodes, d, limp;
+    float              imbal, pme_f_ratio, lossf, lossp = 0;
+    gmx_bool           bLim;
     gmx_domdec_comm_t *comm;
 
     comm = dd->comm;
@@ -5277,36 +5441,36 @@ static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
         npp    = dd->nnodes;
         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
         nnodes = npp + npme;
-        imbal = comm->load_max*npp/comm->load_sum - 1;
-        lossf = dd_force_imb_perf_loss(dd);
-        sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
-        fprintf(fplog,"%s",buf);
-        fprintf(stderr,"\n");
-        fprintf(stderr,"%s",buf);
-        sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
-        fprintf(fplog,"%s",buf);
-        fprintf(stderr,"%s",buf);
+        imbal  = comm->load_max*npp/comm->load_sum - 1;
+        lossf  = dd_force_imb_perf_loss(dd);
+        sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
+        fprintf(fplog, "%s", buf);
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s", buf);
+        sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
+        fprintf(fplog, "%s", buf);
+        fprintf(stderr, "%s", buf);
         bLim = FALSE;
         if (comm->bDynLoadBal)
         {
-            sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
-            for(d=0; d<dd->ndim; d++)
+            sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
+            for (d = 0; d < dd->ndim; d++)
             {
                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
-                sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
+                sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
                 if (limp >= 50)
                 {
                     bLim = TRUE;
                 }
             }
-            sprintf(buf+strlen(buf),"\n");
-            fprintf(fplog,"%s",buf);
-            fprintf(stderr,"%s",buf);
+            sprintf(buf+strlen(buf), "\n");
+            fprintf(fplog, "%s", buf);
+            fprintf(stderr, "%s", buf);
         }
         if (npme > 0)
         {
             pme_f_ratio = comm->load_pme/comm->load_mdf;
-            lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
+            lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
             if (lossp <= 0)
             {
                 lossp *= (float)npme/(float)nnodes;
@@ -5315,45 +5479,45 @@ static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
             {
                 lossp *= (float)npp/(float)nnodes;
             }
-            sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
-            fprintf(fplog,"%s",buf);
-            fprintf(stderr,"%s",buf);
-            sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
-            fprintf(fplog,"%s",buf);
-            fprintf(stderr,"%s",buf);
+            sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
+            fprintf(fplog, "%s", buf);
+            fprintf(stderr, "%s", buf);
+            sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
+            fprintf(fplog, "%s", buf);
+            fprintf(stderr, "%s", buf);
         }
-        fprintf(fplog,"\n");
-        fprintf(stderr,"\n");
-        
-        if (lossf >= DD_PERF_LOSS)
+        fprintf(fplog, "\n");
+        fprintf(stderr, "\n");
+
+        if (lossf >= DD_PERF_LOSS_WARN)
         {
             sprintf(buf,
-                    "NOTE: %.1f %% performance was lost due to load imbalance\n"
-                    "      in the domain decomposition.\n",lossf*100);
+                    "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
+                    "      in the domain decomposition.\n", lossf*100);
             if (!comm->bDynLoadBal)
             {
-                sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
+                sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
             }
             else if (bLim)
             {
-                sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
+                sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
             }
-            fprintf(fplog,"%s\n",buf);
-            fprintf(stderr,"%s\n",buf);
+            fprintf(fplog, "%s\n", buf);
+            fprintf(stderr, "%s\n", buf);
         }
-        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
+        if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
         {
             sprintf(buf,
-                    "NOTE: %.1f %% performance was lost because the PME nodes\n"
-                    "      had %s work to do than the PP nodes.\n"
-                    "      You might want to %s the number of PME nodes\n"
+                    "NOTE: %.1f %% performance was lost because the PME ranks\n"
+                    "      had %s work to do than the PP ranks.\n"
+                    "      You might want to %s the number of PME ranks\n"
                     "      or %s the cut-off and the grid spacing.\n",
                     fabs(lossp*100),
                     (lossp < 0) ? "less"     : "more",
                     (lossp < 0) ? "decrease" : "increase",
                     (lossp < 0) ? "decrease" : "increase");
-            fprintf(fplog,"%s\n",buf);
-            fprintf(stderr,"%s\n",buf);
+            fprintf(fplog, "%s\n", buf);
+            fprintf(stderr, "%s\n", buf);
         }
     }
 }
@@ -5385,75 +5549,75 @@ float dd_pme_f_ratio(gmx_domdec_t *dd)
     }
 }
 
-static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
+static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
 {
-    int flags,d;
+    int  flags, d;
     char buf[22];
-    
+
     flags = dd_load_flags(dd);
     if (flags)
     {
         fprintf(fplog,
                 "DD  load balancing is limited by minimum cell size in dimension");
-        for(d=0; d<dd->ndim; d++)
+        for (d = 0; d < dd->ndim; d++)
         {
             if (flags & (1<<d))
             {
-                fprintf(fplog," %c",dim2char(dd->dim[d]));
+                fprintf(fplog, " %c", dim2char(dd->dim[d]));
             }
         }
-        fprintf(fplog,"\n");
+        fprintf(fplog, "\n");
     }
-    fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
+    fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
     if (dd->comm->bDynLoadBal)
     {
-        fprintf(fplog,"  vol min/aver %5.3f%c",
-                dd_vol_min(dd),flags ? '!' : ' ');
+        fprintf(fplog, "  vol min/aver %5.3f%c",
+                dd_vol_min(dd), flags ? '!' : ' ');
     }
-    fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
+    fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
     if (dd->comm->cycl_n[ddCyclPME])
     {
-        fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
+        fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
     }
-    fprintf(fplog,"\n\n");
+    fprintf(fplog, "\n\n");
 }
 
 static void dd_print_load_verbose(gmx_domdec_t *dd)
 {
     if (dd->comm->bDynLoadBal)
     {
-        fprintf(stderr,"vol %4.2f%c ",
-                dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
+        fprintf(stderr, "vol %4.2f%c ",
+                dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
     }
-    fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
+    fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
     if (dd->comm->cycl_n[ddCyclPME])
     {
-        fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
+        fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
     }
 }
 
 #ifdef GMX_MPI
-static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
+static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
 {
-    MPI_Comm  c_row;
-    int  dim, i, rank;
-    ivec loc_c;
+    MPI_Comm           c_row;
+    int                dim, i, rank;
+    ivec               loc_c;
     gmx_domdec_root_t *root;
-    gmx_bool bPartOfGroup = FALSE;
-    
+    gmx_bool           bPartOfGroup = FALSE;
+
     dim = dd->dim[dim_ind];
-    copy_ivec(loc,loc_c);
-    for(i=0; i<dd->nc[dim]; i++)
+    copy_ivec(loc, loc_c);
+    for (i = 0; i < dd->nc[dim]; i++)
     {
         loc_c[dim] = i;
-        rank = dd_index(dd->nc,loc_c);
+        rank       = dd_index(dd->nc, loc_c);
         if (rank == dd->rank)
         {
             /* This process is part of the group */
             bPartOfGroup = TRUE;
         }
     }
-    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
+    MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
                    &c_row);
     if (bPartOfGroup)
     {
@@ -5463,160 +5627,220 @@ static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
             if (dd->ci[dim] == dd->master_ci[dim])
             {
                 /* This is the root process of this row */
-                snew(dd->comm->root[dim_ind],1);
+                snew(dd->comm->root[dim_ind], 1);
                 root = dd->comm->root[dim_ind];
-                snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
-                snew(root->old_cell_f,dd->nc[dim]+1);
-                snew(root->bCellMin,dd->nc[dim]);
+                snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
+                snew(root->old_cell_f, dd->nc[dim]+1);
+                snew(root->bCellMin, dd->nc[dim]);
                 if (dim_ind > 0)
                 {
-                    snew(root->cell_f_max0,dd->nc[dim]);
-                    snew(root->cell_f_min1,dd->nc[dim]);
-                    snew(root->bound_min,dd->nc[dim]);
-                    snew(root->bound_max,dd->nc[dim]);
+                    snew(root->cell_f_max0, dd->nc[dim]);
+                    snew(root->cell_f_min1, dd->nc[dim]);
+                    snew(root->bound_min, dd->nc[dim]);
+                    snew(root->bound_max, dd->nc[dim]);
                 }
-                snew(root->buf_ncd,dd->nc[dim]);
+                snew(root->buf_ncd, dd->nc[dim]);
             }
             else
             {
                 /* This is not a root process, we only need to receive cell_f */
-                snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
+                snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
             }
         }
         if (dd->ci[dim] == dd->master_ci[dim])
         {
-            snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
+            snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
         }
     }
 }
 #endif
 
-static void make_load_communicators(gmx_domdec_t *dd)
+void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
+                                   const gmx_hw_info_t gmx_unused *hwinfo,
+                                   const gmx_hw_opt_t  gmx_unused *hw_opt)
 {
 #ifdef GMX_MPI
-  int  dim0,dim1,i,j;
-  ivec loc;
-
-  if (debug)
-    fprintf(debug,"Making load communicators\n");
-
-  snew(dd->comm->load,dd->ndim);
-  snew(dd->comm->mpi_comm_load,dd->ndim);
-  
-  clear_ivec(loc);
-  make_load_communicator(dd,0,loc);
-  if (dd->ndim > 1) {
-    dim0 = dd->dim[0];
-    for(i=0; i<dd->nc[dim0]; i++) {
-      loc[dim0] = i;
-      make_load_communicator(dd,1,loc);
+    int           physicalnode_id_hash;
+    int           gpu_id;
+    gmx_domdec_t *dd;
+    MPI_Comm      mpi_comm_pp_physicalnode;
+
+    if (!(cr->duty & DUTY_PP) ||
+        hw_opt->gpu_opt.ncuda_dev_use == 0)
+    {
+        /* Only PP nodes (currently) use GPUs.
+         * If we don't have GPUs, there are no resources to share.
+         */
+        return;
+    }
+
+    physicalnode_id_hash = gmx_physicalnode_id_hash();
+
+    gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
+
+    dd = cr->dd;
+
+    if (debug)
+    {
+        fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
+        fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
+                dd->rank, physicalnode_id_hash, gpu_id);
+    }
+    /* Split the PP communicator over the physical nodes */
+    /* TODO: See if we should store this (before), as it's also used for
+     * the nodecomm summation.
+     */
+    MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
+                   &mpi_comm_pp_physicalnode);
+    MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
+                   &dd->comm->mpi_comm_gpu_shared);
+    MPI_Comm_free(&mpi_comm_pp_physicalnode);
+    MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
+
+    if (debug)
+    {
+        fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
+    }
+
+    /* Note that some ranks could share a GPU, while others don't */
+
+    if (dd->comm->nrank_gpu_shared == 1)
+    {
+        MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
     }
-  }
-  if (dd->ndim > 2) {
-    dim0 = dd->dim[0];
-    for(i=0; i<dd->nc[dim0]; i++) {
-      loc[dim0] = i;
-      dim1 = dd->dim[1];
-      for(j=0; j<dd->nc[dim1]; j++) {
-         loc[dim1] = j;
-         make_load_communicator(dd,2,loc);
-      }
-    }
-  }
-
-  if (debug)
-    fprintf(debug,"Finished making load communicators\n");
 #endif
 }
 
-void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
+static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
 {
-    gmx_bool bZYX;
-    int  d,dim,i,j,m;
-    ivec tmp,s;
-    int  nzone,nzonep;
-    ivec dd_zp[DD_MAXIZONE];
-    gmx_domdec_zones_t *zones;
+#ifdef GMX_MPI
+    int  dim0, dim1, i, j;
+    ivec loc;
+
+    if (debug)
+    {
+        fprintf(debug, "Making load communicators\n");
+    }
+
+    snew(dd->comm->load, dd->ndim);
+    snew(dd->comm->mpi_comm_load, dd->ndim);
+
+    clear_ivec(loc);
+    make_load_communicator(dd, 0, loc);
+    if (dd->ndim > 1)
+    {
+        dim0 = dd->dim[0];
+        for (i = 0; i < dd->nc[dim0]; i++)
+        {
+            loc[dim0] = i;
+            make_load_communicator(dd, 1, loc);
+        }
+    }
+    if (dd->ndim > 2)
+    {
+        dim0 = dd->dim[0];
+        for (i = 0; i < dd->nc[dim0]; i++)
+        {
+            loc[dim0] = i;
+            dim1      = dd->dim[1];
+            for (j = 0; j < dd->nc[dim1]; j++)
+            {
+                loc[dim1] = j;
+                make_load_communicator(dd, 2, loc);
+            }
+        }
+    }
+
+    if (debug)
+    {
+        fprintf(debug, "Finished making load communicators\n");
+    }
+#endif
+}
+
+void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
+{
+    gmx_bool                bZYX;
+    int                     d, dim, i, j, m;
+    ivec                    tmp, s;
+    int                     nzone, nzonep;
+    ivec                    dd_zp[DD_MAXIZONE];
+    gmx_domdec_zones_t     *zones;
     gmx_domdec_ns_ranges_t *izone;
-    
-    for(d=0; d<dd->ndim; d++)
+
+    for (d = 0; d < dd->ndim; d++)
     {
         dim = dd->dim[d];
-        copy_ivec(dd->ci,tmp);
-        tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
-        dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
-        copy_ivec(dd->ci,tmp);
-        tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
-        dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
+        copy_ivec(dd->ci, tmp);
+        tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
+        dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
+        copy_ivec(dd->ci, tmp);
+        tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
+        dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
         if (debug)
         {
-            fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
-                    dd->rank,dim,
+            fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
+                    dd->rank, dim,
                     dd->neighbor[d][0],
                     dd->neighbor[d][1]);
         }
     }
-    
-    if (DDMASTER(dd))
-    {
-        fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
-           dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
-    }
+
     if (fplog)
     {
-        fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
+        fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
                 dd->ndim,
-                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
-                dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
+                dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
+                dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
     }
     switch (dd->ndim)
     {
-    case 3:
-        nzone  = dd_z3n;
-        nzonep = dd_zp3n;
-        for(i=0; i<nzonep; i++)
-        {
-            copy_ivec(dd_zp3[i],dd_zp[i]);
-        }
-        break;
-    case 2:
-        nzone  = dd_z2n;
-        nzonep = dd_zp2n;
-        for(i=0; i<nzonep; i++)
-        {
-            copy_ivec(dd_zp2[i],dd_zp[i]);
-        }
-        break;
-    case 1:
-        nzone  = dd_z1n;
-        nzonep = dd_zp1n;
-        for(i=0; i<nzonep; i++)
-        {
-            copy_ivec(dd_zp1[i],dd_zp[i]);
-        }
-        break;
-    default:
-        gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
-        nzone = 0;
-        nzonep = 0;
+        case 3:
+            nzone  = dd_z3n;
+            nzonep = dd_zp3n;
+            for (i = 0; i < nzonep; i++)
+            {
+                copy_ivec(dd_zp3[i], dd_zp[i]);
+            }
+            break;
+        case 2:
+            nzone  = dd_z2n;
+            nzonep = dd_zp2n;
+            for (i = 0; i < nzonep; i++)
+            {
+                copy_ivec(dd_zp2[i], dd_zp[i]);
+            }
+            break;
+        case 1:
+            nzone  = dd_z1n;
+            nzonep = dd_zp1n;
+            for (i = 0; i < nzonep; i++)
+            {
+                copy_ivec(dd_zp1[i], dd_zp[i]);
+            }
+            break;
+        default:
+            gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
+            nzone  = 0;
+            nzonep = 0;
     }
 
     zones = &dd->comm->zones;
 
-    for(i=0; i<nzone; i++)
+    for (i = 0; i < nzone; i++)
     {
         m = 0;
         clear_ivec(zones->shift[i]);
-        for(d=0; d<dd->ndim; d++)
+        for (d = 0; d < dd->ndim; d++)
         {
             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
         }
     }
-    
+
     zones->n = nzone;
-    for(i=0; i<nzone; i++)
+    for (i = 0; i < nzone; i++)
     {
-        for(d=0; d<DIM; d++)
+        for (d = 0; d < DIM; d++)
         {
             s[d] = dd->ci[d] - zones->shift[i][d];
             if (s[d] < 0)
@@ -5630,16 +5854,16 @@ void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
         }
     }
     zones->nizone = nzonep;
-    for(i=0; i<zones->nizone; i++)
+    for (i = 0; i < zones->nizone; i++)
     {
         if (dd_zp[i][0] != i)
         {
-            gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
+            gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
         }
-        izone = &zones->izone[i];
+        izone     = &zones->izone[i];
         izone->j0 = dd_zp[i][1];
         izone->j1 = dd_zp[i][2];
-        for(dim=0; dim<DIM; dim++)
+        for (dim = 0; dim < DIM; dim++)
         {
             if (dd->nc[dim] == 1)
             {
@@ -5650,22 +5874,22 @@ void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
             else
             {
                 /*
-                  izone->shift0[d] = 0;
-                  izone->shift1[d] = 0;
-                  for(j=izone->j0; j<izone->j1; j++) {
-                  if (dd->shift[j][d] > dd->shift[i][d])
-                  izone->shift0[d] = -1;
-                  if (dd->shift[j][d] < dd->shift[i][d])
-                  izone->shift1[d] = 1;
-                  }
-                */
-                
+                   izone->shift0[d] = 0;
+                   izone->shift1[d] = 0;
+                   for(j=izone->j0; j<izone->j1; j++) {
+                   if (dd->shift[j][d] > dd->shift[i][d])
+                   izone->shift0[d] = -1;
+                   if (dd->shift[j][d] < dd->shift[i][d])
+                   izone->shift1[d] = 1;
+                   }
+                 */
+
                 int shift_diff;
-                
+
                 /* Assume the shift are not more than 1 cell */
                 izone->shift0[dim] = 1;
                 izone->shift1[dim] = -1;
-                for(j=izone->j0; j<izone->j1; j++)
+                for (j = izone->j0; j < izone->j1; j++)
                 {
                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
                     if (shift_diff < izone->shift0[dim])
@@ -5680,62 +5904,62 @@ void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
             }
         }
     }
-    
+
     if (dd->comm->eDLB != edlbNO)
     {
-        snew(dd->comm->root,dd->ndim);
+        snew(dd->comm->root, dd->ndim);
     }
-    
+
     if (dd->comm->bRecordLoad)
     {
         make_load_communicators(dd);
     }
 }
 
-static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
+static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
 {
-    gmx_domdec_t *dd;
+    gmx_domdec_t      *dd;
     gmx_domdec_comm_t *comm;
-    int  i,rank,*buf;
-    ivec periods;
+    int                i, rank, *buf;
+    ivec               periods;
 #ifdef GMX_MPI
-    MPI_Comm comm_cart;
+    MPI_Comm           comm_cart;
 #endif
-    
-    dd = cr->dd;
+
+    dd   = cr->dd;
     comm = dd->comm;
-    
+
 #ifdef GMX_MPI
     if (comm->bCartesianPP)
     {
         /* Set up cartesian communication for the particle-particle part */
         if (fplog)
         {
-            fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
-                    dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
+            fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
+                    dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
         }
-        
-        for(i=0; i<DIM; i++)
+
+        for (i = 0; i < DIM; i++)
         {
             periods[i] = TRUE;
         }
-        MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
+        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
                         &comm_cart);
         /* We overwrite the old communicator with the new cartesian one */
         cr->mpi_comm_mygroup = comm_cart;
     }
-    
+
     dd->mpi_comm_all = cr->mpi_comm_mygroup;
-    MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
-    
+    MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
+
     if (comm->bCartesianPP_PME)
     {
         /* Since we want to use the original cartesian setup for sim,
          * and not the one after split, we need to make an index.
          */
-        snew(comm->ddindex2ddnodeid,dd->nnodes);
-        comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
-        gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
+        snew(comm->ddindex2ddnodeid, dd->nnodes);
+        comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
+        gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
         /* Get the rank of the DD master,
          * above we made sure that the master node is a PP node.
          */
@@ -5747,7 +5971,7 @@ static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
         {
             rank = 0;
         }
-        MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
+        MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
     }
     else if (comm->bCartesianPP)
     {
@@ -5759,86 +5983,86 @@ static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
         }
         cr->nodeid = dd->rank;
-        
-        MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
-        
+
+        MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
+
         /* We need to make an index to go from the coordinates
          * to the nodeid of this simulation.
          */
-        snew(comm->ddindex2simnodeid,dd->nnodes);
-        snew(buf,dd->nnodes);
+        snew(comm->ddindex2simnodeid, dd->nnodes);
+        snew(buf, dd->nnodes);
         if (cr->duty & DUTY_PP)
         {
-            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
+            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
         }
         /* Communicate the ddindex to simulation nodeid index */
-        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
+        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                       cr->mpi_comm_mysim);
         sfree(buf);
-        
+
         /* Determine the master coordinates and rank.
          * The DD master should be the same node as the master of this sim.
          */
-        for(i=0; i<dd->nnodes; i++)
+        for (i = 0; i < dd->nnodes; i++)
         {
             if (comm->ddindex2simnodeid[i] == 0)
             {
-                ddindex2xyz(dd->nc,i,dd->master_ci);
-                MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
+                ddindex2xyz(dd->nc, i, dd->master_ci);
+                MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
             }
         }
         if (debug)
         {
-            fprintf(debug,"The master rank is %d\n",dd->masterrank);
+            fprintf(debug, "The master rank is %d\n", dd->masterrank);
         }
     }
     else
     {
         /* No Cartesian communicators */
         /* We use the rank in dd->comm->all as DD index */
-        ddindex2xyz(dd->nc,dd->rank,dd->ci);
+        ddindex2xyz(dd->nc, dd->rank, dd->ci);
         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
         dd->masterrank = 0;
         clear_ivec(dd->master_ci);
     }
 #endif
-  
+
     if (fplog)
     {
         fprintf(fplog,
-                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
-                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
+                "Domain decomposition rank %d, coordinates %d %d %d\n\n",
+                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
     }
     if (debug)
     {
         fprintf(debug,
-                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
-                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
+                "Domain decomposition rank %d, coordinates %d %d %d\n\n",
+                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
     }
 }
 
 static void receive_ddindex2simnodeid(t_commrec *cr)
 {
-    gmx_domdec_t *dd;
-    
+    gmx_domdec_t      *dd;
+
     gmx_domdec_comm_t *comm;
-    int  *buf;
-    
-    dd = cr->dd;
+    int               *buf;
+
+    dd   = cr->dd;
     comm = dd->comm;
-    
+
 #ifdef GMX_MPI
     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
     {
-        snew(comm->ddindex2simnodeid,dd->nnodes);
-        snew(buf,dd->nnodes);
+        snew(comm->ddindex2simnodeid, dd->nnodes);
+        snew(buf, dd->nnodes);
         if (cr->duty & DUTY_PP)
         {
-            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
+            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
         }
 #ifdef GMX_MPI
         /* Communicate the ddindex to simulation nodeid index */
-        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
+        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                       cr->mpi_comm_mysim);
 #endif
         sfree(buf);
@@ -5847,22 +6071,22 @@ static void receive_ddindex2simnodeid(t_commrec *cr)
 }
 
 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
-                                                     int ncg,int natoms)
+                                                     int ncg, int natoms)
 {
     gmx_domdec_master_t *ma;
-    int i;
+    int                  i;
+
+    snew(ma, 1);
 
-    snew(ma,1);
-    
-    snew(ma->ncg,dd->nnodes);
-    snew(ma->index,dd->nnodes+1);
-    snew(ma->cg,ncg);
-    snew(ma->nat,dd->nnodes);
-    snew(ma->ibuf,dd->nnodes*2);
-    snew(ma->cell_x,DIM);
-    for(i=0; i<DIM; i++)
+    snew(ma->ncg, dd->nnodes);
+    snew(ma->index, dd->nnodes+1);
+    snew(ma->cg, ncg);
+    snew(ma->nat, dd->nnodes);
+    snew(ma->ibuf, dd->nnodes*2);
+    snew(ma->cell_x, DIM);
+    for (i = 0; i < DIM; i++)
     {
-        snew(ma->cell_x[i],dd->nc[i]+1);
+        snew(ma->cell_x[i], dd->nc[i]+1);
     }
 
     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
@@ -5871,30 +6095,30 @@ static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
     }
     else
     {
-        snew(ma->vbuf,natoms);
+        snew(ma->vbuf, natoms);
     }
 
     return ma;
 }
 
-static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
-                               int reorder)
+static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
+                               int gmx_unused reorder)
 {
-    gmx_domdec_t *dd;
+    gmx_domdec_t      *dd;
     gmx_domdec_comm_t *comm;
-    int  i,rank;
-    gmx_bool bDiv[DIM];
-    ivec periods;
+    int                i, rank;
+    gmx_bool           bDiv[DIM];
+    ivec               periods;
 #ifdef GMX_MPI
-    MPI_Comm comm_cart;
+    MPI_Comm           comm_cart;
 #endif
-    
-    dd = cr->dd;
+
+    dd   = cr->dd;
     comm = dd->comm;
-    
+
     if (comm->bCartesianPP)
     {
-        for(i=1; i<DIM; i++)
+        for (i = 1; i < DIM; i++)
         {
             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
         }
@@ -5923,47 +6147,47 @@ static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
         }
         else if (fplog)
         {
-            fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
+            fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
             fprintf(fplog,
                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
         }
     }
-    
+
 #ifdef GMX_MPI
     if (comm->bCartesianPP_PME)
     {
         if (fplog)
         {
-            fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
+            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
         }
-        
-        for(i=0; i<DIM; i++)
+
+        for (i = 0; i < DIM; i++)
         {
             periods[i] = TRUE;
         }
-        MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
+        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
                         &comm_cart);
-        
-        MPI_Comm_rank(comm_cart,&rank);
+
+        MPI_Comm_rank(comm_cart, &rank);
         if (MASTERNODE(cr) && rank != 0)
         {
-            gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
+            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
         }
-        
+
         /* With this assigment we loose the link to the original communicator
          * which will usually be MPI_COMM_WORLD, unless have multisim.
          */
         cr->mpi_comm_mysim = comm_cart;
-        cr->sim_nodeid = rank;
-        
-        MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
-        
+        cr->sim_nodeid     = rank;
+
+        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
+
         if (fplog)
         {
-            fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
-                    cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
+            fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
+                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
         }
-        
+
         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
         {
             cr->duty = DUTY_PP;
@@ -5973,42 +6197,42 @@ static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
         {
             cr->duty = DUTY_PME;
         }
-        
+
         /* Split the sim communicator into PP and PME only nodes */
         MPI_Comm_split(cr->mpi_comm_mysim,
                        cr->duty,
-                       dd_index(comm->ntot,dd->ci),
+                       dd_index(comm->ntot, dd->ci),
                        &cr->mpi_comm_mygroup);
     }
     else
     {
         switch (dd_node_order)
         {
-        case ddnoPP_PME:
-            if (fplog)
-            {
-                fprintf(fplog,"Order of the nodes: PP first, PME last\n");
-            }
-            break;
-        case ddnoINTERLEAVE:
-            /* Interleave the PP-only and PME-only nodes,
-             * as on clusters with dual-core machines this will double
-             * the communication bandwidth of the PME processes
-             * and thus speed up the PP <-> PME and inter PME communication.
-             */
-            if (fplog)
-            {
-                fprintf(fplog,"Interleaving PP and PME nodes\n");
-            }
-            comm->pmenodes = dd_pmenodes(cr);
-            break;
-        case ddnoCARTESIAN:
-            break;
-        default:
-            gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
+            case ddnoPP_PME:
+                if (fplog)
+                {
+                    fprintf(fplog, "Order of the ranks: PP first, PME last\n");
+                }
+                break;
+            case ddnoINTERLEAVE:
+                /* Interleave the PP-only and PME-only nodes,
+                 * as on clusters with dual-core machines this will double
+                 * the communication bandwidth of the PME processes
+                 * and thus speed up the PP <-> PME and inter PME communication.
+                 */
+                if (fplog)
+                {
+                    fprintf(fplog, "Interleaving PP and PME ranks\n");
+                }
+                comm->pmenodes = dd_pmenodes(cr);
+                break;
+            case ddnoCARTESIAN:
+                break;
+            default:
+                gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
         }
-    
-        if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
+
+        if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
         {
             cr->duty = DUTY_PME;
         }
@@ -6016,47 +6240,47 @@ static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
         {
             cr->duty = DUTY_PP;
         }
-        
+
         /* Split the sim communicator into PP and PME only nodes */
         MPI_Comm_split(cr->mpi_comm_mysim,
                        cr->duty,
                        cr->nodeid,
                        &cr->mpi_comm_mygroup);
-        MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
+        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
     }
 #endif
 
     if (fplog)
     {
-        fprintf(fplog,"This is a %s only node\n\n",
+        fprintf(fplog, "This rank does only %s work.\n\n",
                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
     }
 }
 
-void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
+void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
 {
-    gmx_domdec_t *dd;
+    gmx_domdec_t      *dd;
     gmx_domdec_comm_t *comm;
-    int CartReorder;
-    
-    dd = cr->dd;
+    int                CartReorder;
+
+    dd   = cr->dd;
     comm = dd->comm;
-    
-    copy_ivec(dd->nc,comm->ntot);
-    
-    comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
+
+    copy_ivec(dd->nc, comm->ntot);
+
+    comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
     comm->bCartesianPP_PME = FALSE;
-    
+
     /* Reorder the nodes by default. This might change the MPI ranks.
      * Real reordering is only supported on very few architectures,
      * Blue Gene is one of them.
      */
     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
-    
+
     if (cr->npmenodes > 0)
     {
         /* Split the communicator into a PP and PME part */
-        split_communicator(fplog,cr,dd_node_order,CartReorder);
+        split_communicator(fplog, cr, dd_node_order, CartReorder);
         if (comm->bCartesianPP_PME)
         {
             /* We (possibly) reordered the nodes in split_communicator,
@@ -6068,31 +6292,31 @@ void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
     else
     {
         /* All nodes do PP and PME */
-#ifdef GMX_MPI    
+#ifdef GMX_MPI
         /* We do not require separate communicators */
         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
 #endif
     }
-    
+
     if (cr->duty & DUTY_PP)
     {
         /* Copy or make a new PP communicator */
-        make_pp_communicator(fplog,cr,CartReorder);
+        make_pp_communicator(fplog, cr, CartReorder);
     }
     else
     {
         receive_ddindex2simnodeid(cr);
     }
-    
+
     if (!(cr->duty & DUTY_PME))
     {
         /* Set up the commnuication to our PME node */
-        dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
+        dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
         dd->pme_receive_vir_ener = receive_vir_ener(cr);
         if (debug)
         {
-            fprintf(debug,"My pme_nodeid %d receive ener %d\n",
-                    dd->pme_nodeid,dd->pme_receive_vir_ener);
+            fprintf(debug, "My pme_nodeid %d receive ener %d\n",
+                    dd->pme_nodeid, dd->pme_receive_vir_ener);
         }
     }
     else
@@ -6108,67 +6332,67 @@ void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
     }
 }
 
-static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
+static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
 {
-    real *slb_frac,tot;
-    int  i,n;
+    real  *slb_frac, tot;
+    int    i, n;
     double dbl;
-    
+
     slb_frac = NULL;
     if (nc > 1 && size_string != NULL)
     {
         if (fplog)
         {
-            fprintf(fplog,"Using static load balancing for the %s direction\n",
+            fprintf(fplog, "Using static load balancing for the %s direction\n",
                     dir);
         }
-        snew(slb_frac,nc);
+        snew(slb_frac, nc);
         tot = 0;
-        for (i=0; i<nc; i++)
+        for (i = 0; i < nc; i++)
         {
             dbl = 0;
-            sscanf(size_string,"%lf%n",&dbl,&n);
+            sscanf(size_string, "%lf%n", &dbl, &n);
             if (dbl == 0)
             {
-                gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
+                gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
             }
-            slb_frac[i] = dbl;
+            slb_frac[i]  = dbl;
             size_string += n;
-            tot += slb_frac[i];
+            tot         += slb_frac[i];
         }
         /* Normalize */
         if (fplog)
         {
-            fprintf(fplog,"Relative cell sizes:");
+            fprintf(fplog, "Relative cell sizes:");
         }
-        for (i=0; i<nc; i++)
+        for (i = 0; i < nc; i++)
         {
             slb_frac[i] /= tot;
             if (fplog)
             {
-                fprintf(fplog," %5.3f",slb_frac[i]);
+                fprintf(fplog, " %5.3f", slb_frac[i]);
             }
         }
         if (fplog)
         {
-            fprintf(fplog,"\n");
+            fprintf(fplog, "\n");
         }
     }
-    
+
     return slb_frac;
 }
 
 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
 {
-    int n,nmol,ftype;
+    int                  n, nmol, ftype;
     gmx_mtop_ilistloop_t iloop;
-    t_ilist *il;
-    
-    n = 0;
+    t_ilist             *il;
+
+    n     = 0;
     iloop = gmx_mtop_ilistloop_init(mtop);
-    while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
+    while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
     {
-        for(ftype=0; ftype<F_NRE; ftype++)
+        for (ftype = 0; ftype < F_NRE; ftype++)
         {
             if ((interaction_function[ftype].flags & IF_BOND) &&
                 NRAL(ftype) >  2)
@@ -6176,101 +6400,101 @@ static int multi_body_bondeds_count(gmx_mtop_t *mtop)
                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
             }
         }
-  }
+    }
 
-  return n;
+    return n;
 }
 
-static int dd_nst_env(FILE *fplog,const char *env_var,int def)
+static int dd_getenv(FILE *fplog, const char *env_var, int def)
 {
     char *val;
-    int  nst;
-    
+    int   nst;
+
     nst = def;
     val = getenv(env_var);
     if (val)
     {
-        if (sscanf(val,"%d",&nst) <= 0)
+        if (sscanf(val, "%d", &nst) <= 0)
         {
             nst = 1;
         }
         if (fplog)
         {
-            fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
-                    env_var,val,nst);
+            fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
+                    env_var, val, nst);
         }
     }
-    
+
     return nst;
 }
 
-static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
+static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
 {
     if (MASTER(cr))
     {
-        fprintf(stderr,"\n%s\n",warn_string);
+        fprintf(stderr, "\n%s\n", warn_string);
     }
     if (fplog)
     {
-        fprintf(fplog,"\n%s\n",warn_string);
+        fprintf(fplog, "\n%s\n", warn_string);
     }
 }
 
-static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
-                                  t_inputrec *ir,FILE *fplog)
+static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
+                                  t_inputrec *ir, FILE *fplog)
 {
     if (ir->ePBC == epbcSCREW &&
         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
     {
-        gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
+        gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
     }
 
     if (ir->ns_type == ensSIMPLE)
     {
-        gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
+        gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
     }
 
     if (ir->nstlist == 0)
     {
-        gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
+        gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
     }
 
     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
     {
-        dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
+        dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
     }
 }
 
-static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
+static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
 {
-    int  di,d;
+    int  di, d;
     real r;
 
     r = ddbox->box_size[XX];
-    for(di=0; di<dd->ndim; di++)
+    for (di = 0; di < dd->ndim; di++)
     {
         d = dd->dim[di];
         /* Check using the initial average cell size */
-        r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
+        r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
     }
 
     return r;
 }
 
-static int check_dlb_support(FILE *fplog,t_commrec *cr,
-                             const char *dlb_opt,gmx_bool bRecordLoad,
-                             unsigned long Flags,t_inputrec *ir)
+static int check_dlb_support(FILE *fplog, t_commrec *cr,
+                             const char *dlb_opt, gmx_bool bRecordLoad,
+                             unsigned long Flags, t_inputrec *ir)
 {
     gmx_domdec_t *dd;
-    int  eDLB=-1;
-    char buf[STRLEN];
+    int           eDLB = -1;
+    char          buf[STRLEN];
 
     switch (dlb_opt[0])
     {
-    case 'a': eDLB = edlbAUTO; break;
-    case 'n': eDLB = edlbNO;   break;
-    case 'y': eDLB = edlbYES;  break;
-    default: gmx_incons("Unknown dlb_opt");
+        case 'a': eDLB = edlbAUTO; break;
+        case 'n': eDLB = edlbNO;   break;
+        case 'y': eDLB = edlbYES;  break;
+        default: gmx_incons("Unknown dlb_opt");
     }
 
     if (Flags & MD_RERUN)
@@ -6282,16 +6506,16 @@ static int check_dlb_support(FILE *fplog,t_commrec *cr,
     {
         if (eDLB == edlbYES)
         {
-            sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
-            dd_warning(cr,fplog,buf);
+            sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
+            dd_warning(cr, fplog, buf);
         }
-            
+
         return edlbNO;
     }
 
     if (!bRecordLoad)
     {
-        dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
+        dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
 
         return edlbNO;
     }
@@ -6300,25 +6524,25 @@ static int check_dlb_support(FILE *fplog,t_commrec *cr,
     {
         switch (eDLB)
         {
-                       case edlbNO: 
-                               break;
-                       case edlbAUTO:
-                               dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
-                               eDLB = edlbNO;
-                               break;
-                       case edlbYES:
-                               dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
-                               break;
-                       default:
-                               gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
-                               break;
+            case edlbNO:
+                break;
+            case edlbAUTO:
+                dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
+                eDLB = edlbNO;
+                break;
+            case edlbYES:
+                dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
+                break;
+            default:
+                gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
+                break;
         }
     }
 
     return eDLB;
 }
 
-static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
+static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
 {
     int dim;
 
@@ -6328,9 +6552,9 @@ static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
         /* Decomposition order z,y,x */
         if (fplog)
         {
-            fprintf(fplog,"Using domain decomposition order z, y, x\n");
+            fprintf(fplog, "Using domain decomposition order z, y, x\n");
         }
-        for(dim=DIM-1; dim>=0; dim--)
+        for (dim = DIM-1; dim >= 0; dim--)
         {
             if (dd->nc[dim] > 1)
             {
@@ -6341,7 +6565,7 @@ static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
     else
     {
         /* Decomposition order x,y,z */
-        for(dim=0; dim<DIM; dim++)
+        for (dim = 0; dim < DIM; dim++)
         {
             if (dd->nc[dim] > 1)
             {
@@ -6354,17 +6578,17 @@ static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
 static gmx_domdec_comm_t *init_dd_comm()
 {
     gmx_domdec_comm_t *comm;
-    int  i;
+    int                i;
 
-    snew(comm,1);
-    snew(comm->cggl_flag,DIM*2);
-    snew(comm->cgcm_state,DIM*2);
-    for(i=0; i<DIM*2; i++)
+    snew(comm, 1);
+    snew(comm->cggl_flag, DIM*2);
+    snew(comm->cgcm_state, DIM*2);
+    for (i = 0; i < DIM*2; i++)
     {
         comm->cggl_flag_nalloc[i]  = 0;
         comm->cgcm_state_nalloc[i] = 0;
     }
-    
+
     comm->nalloc_int = 0;
     comm->buf_int    = NULL;
 
@@ -6373,12 +6597,12 @@ static gmx_domdec_comm_t *init_dd_comm()
     comm->n_load_have    = 0;
     comm->n_load_collect = 0;
 
-    for(i=0; i<ddnatNR-ddnatZONE; i++)
+    for (i = 0; i < ddnatNR-ddnatZONE; i++)
     {
         comm->sum_nat[i] = 0;
     }
-    comm->ndecomp = 0;
-    comm->nload   = 0;
+    comm->ndecomp   = 0;
+    comm->nload     = 0;
     comm->load_step = 0;
     comm->load_sum  = 0;
     comm->load_max  = 0;
@@ -6389,62 +6613,62 @@ static gmx_domdec_comm_t *init_dd_comm()
     return comm;
 }
 
-gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
+gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
                                         unsigned long Flags,
                                         ivec nc,
-                                        real comm_distance_min,real rconstr,
-                                        const char *dlb_opt,real dlb_scale,
-                                        const char *sizex,const char *sizey,const char *sizez,
-                                        gmx_mtop_t *mtop,t_inputrec *ir,
-                                        matrix box,rvec *x,
+                                        real comm_distance_min, real rconstr,
+                                        const char *dlb_opt, real dlb_scale,
+                                        const char *sizex, const char *sizey, const char *sizez,
+                                        gmx_mtop_t *mtop, t_inputrec *ir,
+                                        matrix box, rvec *x,
                                         gmx_ddbox_t *ddbox,
-                                        int *npme_x,int *npme_y)
+                                        int *npme_x, int *npme_y)
 {
-    gmx_domdec_t *dd;
+    gmx_domdec_t      *dd;
     gmx_domdec_comm_t *comm;
-    int  recload;
-    int  d,i,j;
-    real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
-    gmx_bool bC;
-    char buf[STRLEN];
-    
+    int                recload;
+    int                d, i, j;
+    real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
+    gmx_bool           bC;
+    char               buf[STRLEN];
+
     if (fplog)
     {
         fprintf(fplog,
-                "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
+                "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
     }
-    
-    snew(dd,1);
+
+    snew(dd, 1);
 
     dd->comm = init_dd_comm();
-    comm = dd->comm;
-    snew(comm->cggl_flag,DIM*2);
-    snew(comm->cgcm_state,DIM*2);
+    comm     = dd->comm;
+    snew(comm->cggl_flag, DIM*2);
+    snew(comm->cgcm_state, DIM*2);
 
     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
-    
-    dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
-    comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
-    comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
-    recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
-    comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
-    comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
-    comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
-    comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
+
+    dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
+    comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
+    comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
+    recload             = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
+    comm->nstSortCG     = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
+    comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
+    comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
+    comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
 
     dd->pme_recv_f_alloc = 0;
-    dd->pme_recv_f_buf = NULL;
+    dd->pme_recv_f_buf   = NULL;
 
     if (dd->bSendRecv2 && fplog)
     {
-        fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
+        fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
     }
     if (comm->eFlop)
     {
         if (fplog)
         {
-            fprintf(fplog,"Will load balance based on FLOP count\n");
+            fprintf(fplog, "Will load balance based on FLOP count\n");
         }
         if (comm->eFlop > 1)
         {
@@ -6455,44 +6679,48 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
     else
     {
         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
-                             
+
     }
-    
-    comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
-    
+
+    /* Initialize the GPU share count to 0; it might change later */
+    comm->nrank_gpu_shared = 0;
+
+    comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
+
     comm->bDynLoadBal = (comm->eDLB == edlbYES);
     if (fplog)
     {
-        fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
+        fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
     }
-    dd->bGridJump = comm->bDynLoadBal;
-    
+    dd->bGridJump              = comm->bDynLoadBal;
+    comm->bPMELoadBalDLBLimits = FALSE;
+
     if (comm->nstSortCG)
     {
         if (fplog)
         {
             if (comm->nstSortCG == 1)
             {
-                fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
+                fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
             }
             else
             {
-                fprintf(fplog,"Will sort the charge groups every %d steps\n",
+                fprintf(fplog, "Will sort the charge groups every %d steps\n",
                         comm->nstSortCG);
             }
         }
-        snew(comm->sort,1);
+        snew(comm->sort, 1);
     }
     else
     {
         if (fplog)
         {
-            fprintf(fplog,"Will not sort the charge groups\n");
+            fprintf(fplog, "Will not sort the charge groups\n");
         }
     }
 
     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
-    
+
     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
     if (comm->bInterCGBondeds)
     {
@@ -6502,7 +6730,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
     {
         comm->bInterCGMultiBody = FALSE;
     }
-    
+
     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
     dd->bInterCGsettles = inter_charge_group_settles(mtop);
 
@@ -6519,9 +6747,16 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
         comm->cutoff   = ir->rlistlong;
     }
     comm->cutoff_mbody = 0;
-    
+
     comm->cellsize_limit = 0;
-    comm->bBondComm = FALSE;
+    comm->bBondComm      = FALSE;
+
+    /* Atoms should be able to move by up to half the list buffer size (if > 0)
+     * within nstlist steps. Since boundaries are allowed to displace by half
+     * a cell size, DD cells should be at least the size of the list buffer.
+     */
+    comm->cellsize_limit = max(comm->cellsize_limit,
+                               ir->rlistlong - max(ir->rvdw, ir->rcoulomb));
 
     if (comm->bInterCGBondeds)
     {
@@ -6534,42 +6769,42 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
             }
             else
             {
-                comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
+                comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
             }
             r_bonded_limit = comm->cutoff_mbody;
         }
         else if (ir->bPeriodicMols)
         {
             /* Can not easily determine the required cut-off */
-            dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
+            dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
             comm->cutoff_mbody = comm->cutoff/2;
-            r_bonded_limit = comm->cutoff_mbody;
+            r_bonded_limit     = comm->cutoff_mbody;
         }
         else
         {
             if (MASTER(cr))
             {
-                dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
-                                      Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
+                dd_bonded_cg_distance(fplog, mtop, ir, x, box,
+                                      Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
             }
-            gmx_bcast(sizeof(r_2b),&r_2b,cr);
-            gmx_bcast(sizeof(r_mb),&r_mb,cr);
+            gmx_bcast(sizeof(r_2b), &r_2b, cr);
+            gmx_bcast(sizeof(r_mb), &r_mb, cr);
 
             /* We use an initial margin of 10% for the minimum cell size,
              * except when we are just below the non-bonded cut-off.
              */
             if (Flags & MD_DDBONDCOMM)
             {
-                if (max(r_2b,r_mb) > comm->cutoff)
+                if (max(r_2b, r_mb) > comm->cutoff)
                 {
-                    r_bonded       = max(r_2b,r_mb);
-                    r_bonded_limit = 1.1*r_bonded;
+                    r_bonded        = max(r_2b, r_mb);
+                    r_bonded_limit  = 1.1*r_bonded;
                     comm->bBondComm = TRUE;
                 }
                 else
                 {
                     r_bonded       = r_mb;
-                    r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
+                    r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
                 }
                 /* We determine cutoff_mbody later */
             }
@@ -6578,12 +6813,12 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
                 /* No special bonded communication,
                  * simply increase the DD cut-off.
                  */
-                r_bonded_limit     = 1.1*max(r_2b,r_mb);
+                r_bonded_limit     = 1.1*max(r_2b, r_mb);
                 comm->cutoff_mbody = r_bonded_limit;
-                comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
+                comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
             }
         }
-        comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
+        comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
         if (fplog)
         {
             fprintf(fplog,
@@ -6595,7 +6830,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
     if (dd->bInterCGcons && rconstr <= 0)
     {
         /* There is a cell size limit due to the constraints (P-LINCS) */
-        rconstr = constr_r_max(fplog,mtop,ir);
+        rconstr = constr_r_max(fplog, mtop, ir);
         if (fplog)
         {
             fprintf(fplog,
@@ -6603,7 +6838,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
                     rconstr);
             if (rconstr > comm->cellsize_limit)
             {
-                fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
+                fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
             }
         }
     }
@@ -6617,77 +6852,77 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
                 rconstr);
     }
-    comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
+    comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
 
     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
 
     if (nc[XX] > 0)
     {
-        copy_ivec(nc,dd->nc);
-        set_dd_dim(fplog,dd);
-        set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
+        copy_ivec(nc, dd->nc);
+        set_dd_dim(fplog, dd);
+        set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
 
         if (cr->npmenodes == -1)
         {
             cr->npmenodes = 0;
         }
-        acs = average_cellsize_min(dd,ddbox);
+        acs = average_cellsize_min(dd, ddbox);
         if (acs < comm->cellsize_limit)
         {
             if (fplog)
             {
-                fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
+                fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
             }
-            gmx_fatal_collective(FARGS,cr,NULL,
+            gmx_fatal_collective(FARGS, cr, NULL,
                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
-                                 acs,comm->cellsize_limit);
+                                 acs, comm->cellsize_limit);
         }
     }
     else
     {
-        set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
+        set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
 
         /* We need to choose the optimal DD grid and possibly PME nodes */
-        limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
-                               comm->eDLB!=edlbNO,dlb_scale,
-                               comm->cellsize_limit,comm->cutoff,
-                               comm->bInterCGBondeds,comm->bInterCGMultiBody);
-        
+        limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
+                               comm->eDLB != edlbNO, dlb_scale,
+                               comm->cellsize_limit, comm->cutoff,
+                               comm->bInterCGBondeds);
+
         if (dd->nc[XX] == 0)
         {
             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
-            sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
+            sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
                     !bC ? "-rdd" : "-rcon",
-                    comm->eDLB!=edlbNO ? " or -dds" : "",
+                    comm->eDLB != edlbNO ? " or -dds" : "",
                     bC ? " or your LINCS settings" : "");
 
-            gmx_fatal_collective(FARGS,cr,NULL,
-                                 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
+            gmx_fatal_collective(FARGS, cr, NULL,
+                                 "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
                                  "%s\n"
                                  "Look in the log file for details on the domain decomposition",
-                                 cr->nnodes-cr->npmenodes,limit,buf);
+                                 cr->nnodes-cr->npmenodes, limit, buf);
         }
-        set_dd_dim(fplog,dd);
+        set_dd_dim(fplog, dd);
     }
 
     if (fplog)
     {
         fprintf(fplog,
-                "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
-                dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
+                "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
+                dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
     }
-    
+
     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
     if (cr->nnodes - dd->nnodes != cr->npmenodes)
     {
-        gmx_fatal_collective(FARGS,cr,NULL,
-                             "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
-                             dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
+        gmx_fatal_collective(FARGS, cr, NULL,
+                             "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
+                             dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
     }
     if (cr->npmenodes > dd->nnodes)
     {
-        gmx_fatal_collective(FARGS,cr,NULL,
-                             "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
+        gmx_fatal_collective(FARGS, cr, NULL,
+                             "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
     }
     if (cr->npmenodes > 0)
     {
@@ -6698,7 +6933,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
         comm->npmenodes = dd->nnodes;
     }
 
-    if (EEL_PME(ir->coulombtype))
+    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
     {
         /* The following choices should match those
          * in comm_cost_est in domdec_setup.c.
@@ -6732,11 +6967,11 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
                 comm->npmenodes_x = comm->npmenodes;
                 comm->npmenodes_y = 1;
             }
-        }    
+        }
         if (fplog)
         {
-            fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
-                    comm->npmenodes_x,comm->npmenodes_y,1);
+            fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
+                    comm->npmenodes_x, comm->npmenodes_y, 1);
         }
     }
     else
@@ -6745,19 +6980,19 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
         comm->npmenodes_x   = 0;
         comm->npmenodes_y   = 0;
     }
-    
+
     /* Technically we don't need both of these,
      * but it simplifies code not having to recalculate it.
      */
     *npme_x = comm->npmenodes_x;
     *npme_y = comm->npmenodes_y;
-        
-    snew(comm->slb_frac,DIM);
+
+    snew(comm->slb_frac, DIM);
     if (comm->eDLB == edlbNO)
     {
-        comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
-        comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
-        comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
+        comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
+        comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
+        comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
     }
 
     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
@@ -6768,17 +7003,17 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
              * the minimum and the maximum,
              * since the extra communication cost is nearly zero.
              */
-            acs = average_cellsize_min(dd,ddbox);
+            acs                = average_cellsize_min(dd, ddbox);
             comm->cutoff_mbody = 0.5*(r_bonded + acs);
             if (comm->eDLB != edlbNO)
             {
                 /* Check if this does not limit the scaling */
-                comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
+                comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
             }
             if (!comm->bBondComm)
             {
                 /* Without bBondComm do not go beyond the n.b. cut-off */
-                comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
+                comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
                 if (comm->cellsize_limit >= comm->cutoff)
                 {
                     /* We don't lose a lot of efficiency
@@ -6790,7 +7025,7 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
                 }
             }
             /* Check if we did not end up below our original limit */
-            comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
+            comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
 
             if (comm->cutoff_mbody > comm->cellsize_limit)
             {
@@ -6802,18 +7037,18 @@ gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
 
     if (debug)
     {
-        fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
+        fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
                 "cellsize limit %f\n",
-                comm->bBondComm,comm->cellsize_limit);
+                comm->bBondComm, comm->cellsize_limit);
     }
-    
+
     if (MASTER(cr))
     {
-        check_dd_restrictions(cr,dd,ir,fplog);
+        check_dd_restrictions(cr, dd, ir, fplog);
     }
 
     comm->partition_step = INT_MIN;
-    dd->ddp_count = 0;
+    dd->ddp_count        = 0;
 
     clear_dd_cycle_counts(dd);
 
@@ -6825,40 +7060,40 @@ static void set_dlb_limits(gmx_domdec_t *dd)
 {
     int d;
 
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
-        dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
+        dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
         dd->comm->cellsize_min[dd->dim[d]] =
             dd->comm->cellsize_min_dlb[dd->dim[d]];
     }
 }
 
 
-static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
+static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
 {
-    gmx_domdec_t *dd;
+    gmx_domdec_t      *dd;
     gmx_domdec_comm_t *comm;
-    real cellsize_min;
-    int  d,nc,i;
-    char buf[STRLEN];
-    
-    dd = cr->dd;
+    real               cellsize_min;
+    int                d, nc, i;
+    char               buf[STRLEN];
+
+    dd   = cr->dd;
     comm = dd->comm;
-    
+
     if (fplog)
     {
-        fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
+        fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
     }
 
     cellsize_min = comm->cellsize_min[dd->dim[0]];
-    for(d=1; d<dd->ndim; d++)
+    for (d = 1; d < dd->ndim; d++)
     {
-        cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
+        cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
     }
 
     if (cellsize_min < comm->cellsize_limit*1.05)
     {
-        dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
+        dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
 
         /* Change DLB from "auto" to "no". */
         comm->eDLB = edlbNO;
@@ -6866,24 +7101,24 @@ static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
         return;
     }
 
-    dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
+    dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
     comm->bDynLoadBal = TRUE;
-    dd->bGridJump = TRUE;
-    
+    dd->bGridJump     = TRUE;
+
     set_dlb_limits(dd);
 
     /* We can set the required cell size info here,
      * so we do not need to communicate this.
      * The grid is completely uniform.
      */
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
         if (comm->root[d])
         {
             comm->load[d].sum_m = comm->load[d].sum;
 
             nc = dd->nc[dd->dim[d]];
-            for(i=0; i<nc; i++)
+            for (i = 0; i < nc; i++)
             {
                 comm->root[d]->cell_f[i]    = i/(real)nc;
                 if (d > 0)
@@ -6899,12 +7134,12 @@ static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
 
 static char *init_bLocalCG(gmx_mtop_t *mtop)
 {
-    int  ncg,cg;
+    int   ncg, cg;
     char *bLocalCG;
-    
+
     ncg = ncg_mtop(mtop);
-    snew(bLocalCG,ncg);
-    for(cg=0; cg<ncg; cg++)
+    snew(bLocalCG, ncg);
+    for (cg = 0; cg < ncg; cg++)
     {
         bLocalCG[cg] = FALSE;
     }
@@ -6913,15 +7148,15 @@ static char *init_bLocalCG(gmx_mtop_t *mtop)
 }
 
 void dd_init_bondeds(FILE *fplog,
-                     gmx_domdec_t *dd,gmx_mtop_t *mtop,
-                     gmx_vsite_t *vsite,gmx_constr_t constr,
-                     t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
+                     gmx_domdec_t *dd, gmx_mtop_t *mtop,
+                     gmx_vsite_t *vsite,
+                     t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
 {
     gmx_domdec_comm_t *comm;
-    gmx_bool bBondComm;
-    int  d;
+    gmx_bool           bBondComm;
+    int                d;
 
-    dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
+    dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
 
     comm = dd->comm;
 
@@ -6930,7 +7165,7 @@ void dd_init_bondeds(FILE *fplog,
         /* Communicate atoms beyond the cut-off for bonded interactions */
         comm = dd->comm;
 
-        comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
+        comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
 
         comm->bLocalCG = init_bLocalCG(mtop);
     }
@@ -6942,16 +7177,16 @@ void dd_init_bondeds(FILE *fplog,
     }
 }
 
-static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
+static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
                               t_inputrec *ir,
-                              gmx_bool bDynLoadBal,real dlb_scale,
+                              gmx_bool bDynLoadBal, real dlb_scale,
                               gmx_ddbox_t *ddbox)
 {
     gmx_domdec_comm_t *comm;
-    int  d;
-    ivec np;
-    real limit,shrink;
-    char buf[64];
+    int                d;
+    ivec               np;
+    real               limit, shrink;
+    char               buf[64];
 
     if (fplog == NULL)
     {
@@ -6962,16 +7197,16 @@ static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
 
     if (bDynLoadBal)
     {
-        fprintf(fplog,"The maximum number of communication pulses is:");
-        for(d=0; d<dd->ndim; d++)
+        fprintf(fplog, "The maximum number of communication pulses is:");
+        for (d = 0; d < dd->ndim; d++)
         {
-            fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
+            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
         }
-        fprintf(fplog,"\n");
-        fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
-        fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
-        fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
-        for(d=0; d<DIM; d++)
+        fprintf(fplog, "\n");
+        fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
+        fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
+        fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
+        for (d = 0; d < DIM; d++)
         {
             if (dd->nc[d] > 1)
             {
@@ -6985,36 +7220,37 @@ static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
                         comm->cellsize_min_dlb[d]/
                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
                 }
-                fprintf(fplog," %c %.2f",dim2char(d),shrink);
+                fprintf(fplog, " %c %.2f", dim2char(d), shrink);
             }
         }
-        fprintf(fplog,"\n");
+        fprintf(fplog, "\n");
     }
     else
     {
-        set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
-        fprintf(fplog,"The initial number of communication pulses is:");
-        for(d=0; d<dd->ndim; d++)
+        set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
+        fprintf(fplog, "The initial number of communication pulses is:");
+        for (d = 0; d < dd->ndim; d++)
         {
-            fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
+            fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
         }
-        fprintf(fplog,"\n");
-        fprintf(fplog,"The initial domain decomposition cell size is:");
-        for(d=0; d<DIM; d++) {
+        fprintf(fplog, "\n");
+        fprintf(fplog, "The initial domain decomposition cell size is:");
+        for (d = 0; d < DIM; d++)
+        {
             if (dd->nc[d] > 1)
             {
-                fprintf(fplog," %c %.2f nm",
-                        dim2char(d),dd->comm->cellsize_min[d]);
+                fprintf(fplog, " %c %.2f nm",
+                        dim2char(d), dd->comm->cellsize_min[d]);
             }
         }
-        fprintf(fplog,"\n\n");
+        fprintf(fplog, "\n\n");
     }
-    
+
     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
     {
-        fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
-        fprintf(fplog,"%40s  %-7s %6.3f nm\n",
-                "non-bonded interactions","",comm->cutoff);
+        fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
+        fprintf(fplog, "%40s  %-7s %6.3f nm\n",
+                "non-bonded interactions", "", comm->cutoff);
 
         if (bDynLoadBal)
         {
@@ -7022,61 +7258,61 @@ static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
         }
         else
         {
-            if (dynamic_dd_box(ddbox,ir))
+            if (dynamic_dd_box(ddbox, ir))
             {
-                fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
+                fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
             }
             limit = dd->comm->cellsize_min[XX];
-            for(d=1; d<DIM; d++)
+            for (d = 1; d < DIM; d++)
             {
-                limit = min(limit,dd->comm->cellsize_min[d]);
+                limit = min(limit, dd->comm->cellsize_min[d]);
             }
         }
 
         if (comm->bInterCGBondeds)
         {
-            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
-                    "two-body bonded interactions","(-rdd)",
-                    max(comm->cutoff,comm->cutoff_mbody));
-            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
-                    "multi-body bonded interactions","(-rdd)",
-                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
+            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
+                    "two-body bonded interactions", "(-rdd)",
+                    max(comm->cutoff, comm->cutoff_mbody));
+            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
+                    "multi-body bonded interactions", "(-rdd)",
+                    (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
         }
         if (dd->vsite_comm)
         {
-            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
-                    "virtual site constructions","(-rcon)",limit);
+            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
+                    "virtual site constructions", "(-rcon)", limit);
         }
         if (dd->constraint_comm)
         {
-            sprintf(buf,"atoms separated by up to %d constraints",
+            sprintf(buf, "atoms separated by up to %d constraints",
                     1+ir->nProjOrder);
-            fprintf(fplog,"%40s  %-7s %6.3f nm\n",
-                    buf,"(-rcon)",limit);
+            fprintf(fplog, "%40s  %-7s %6.3f nm\n",
+                    buf, "(-rcon)", limit);
         }
-        fprintf(fplog,"\n");
+        fprintf(fplog, "\n");
     }
-    
+
     fflush(fplog);
 }
 
-static void set_cell_limits_dlb(gmx_domdec_t *dd,
-                                real dlb_scale,
-                                const t_inputrec *ir,
+static void set_cell_limits_dlb(gmx_domdec_t      *dd,
+                                real               dlb_scale,
+                                const t_inputrec  *ir,
                                 const gmx_ddbox_t *ddbox)
 {
     gmx_domdec_comm_t *comm;
-    int  d,dim,npulse,npulse_d_max,npulse_d;
-    gmx_bool bNoCutOff;
+    int                d, dim, npulse, npulse_d_max, npulse_d;
+    gmx_bool           bNoCutOff;
 
     comm = dd->comm;
 
     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
 
     /* Determine the maximum number of comm. pulses in one dimension */
-        
-    comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
-        
+
+    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
+
     /* Determine the maximum required number of grid pulses */
     if (comm->cellsize_limit >= comm->cutoff)
     {
@@ -7096,38 +7332,38 @@ static void set_cell_limits_dlb(gmx_domdec_t *dd,
     else
     {
         /* There is no cell size limit */
-        npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
+        npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
     }
 
     if (!bNoCutOff && npulse > 1)
     {
         /* See if we can do with less pulses, based on dlb_scale */
         npulse_d_max = 0;
-        for(d=0; d<dd->ndim; d++)
+        for (d = 0; d < dd->ndim; d++)
         {
-            dim = dd->dim[d];
+            dim      = dd->dim[d];
             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
-            npulse_d_max = max(npulse_d_max,npulse_d);
+            npulse_d_max = max(npulse_d_max, npulse_d);
         }
-        npulse = min(npulse,npulse_d_max);
+        npulse = min(npulse, npulse_d_max);
     }
 
     /* This env var can override npulse */
-    d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
+    d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
     if (d > 0)
     {
         npulse = d;
     }
 
-    comm->maxpulse = 1;
+    comm->maxpulse       = 1;
     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
-        comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
+        comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
-        snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
-        comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
+        snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
+        comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
         {
             comm->bVacDLBNoLimit = FALSE;
@@ -7140,9 +7376,9 @@ static void set_cell_limits_dlb(gmx_domdec_t *dd,
         comm->cellsize_limit = max(comm->cellsize_limit,
                                    comm->cutoff/comm->maxpulse);
     }
-    comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
+    comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
     /* Set the minimum cell size for each DD dimension */
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
         if (comm->bVacDLBNoLimit ||
             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
@@ -7157,7 +7393,7 @@ static void set_cell_limits_dlb(gmx_domdec_t *dd,
     }
     if (comm->cutoff_mbody <= 0)
     {
-        comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
+        comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
     }
     if (comm->bDynLoadBal)
     {
@@ -7165,25 +7401,24 @@ static void set_cell_limits_dlb(gmx_domdec_t *dd,
     }
 }
 
-gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
+gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
 {
     /* If each molecule is a single charge group
      * or we use domain decomposition for each periodic dimension,
      * we do not need to take pbc into account for the bonded interactions.
      */
     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
-            !(dd->nc[XX]>1 &&
-              dd->nc[YY]>1 &&
-              (dd->nc[ZZ]>1 || ePBC==epbcXY)));
+            !(dd->nc[XX] > 1 &&
+              dd->nc[YY] > 1 &&
+              (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
 }
 
-void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
-                       t_inputrec *ir,t_forcerec *fr,
-                       gmx_ddbox_t *ddbox)
+void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
+                       t_inputrec *ir, gmx_ddbox_t *ddbox)
 {
     gmx_domdec_comm_t *comm;
-    int  natoms_tot;
-    real vol_frac;
+    int                natoms_tot;
+    real               vol_frac;
 
     comm = dd->comm;
 
@@ -7194,15 +7429,15 @@ void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
     comm->nth = gmx_omp_nthreads_get(emntDomdec);
     if (comm->nth > 1)
     {
-        snew(comm->dth,comm->nth);
+        snew(comm->dth, comm->nth);
     }
 
-    if (EEL_PME(ir->coulombtype))
+    if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
     {
-        init_ddpme(dd,&comm->ddpme[0],0);
+        init_ddpme(dd, &comm->ddpme[0], 0);
         if (comm->npmedecompdim >= 2)
         {
-            init_ddpme(dd,&comm->ddpme[1],1);
+            init_ddpme(dd, &comm->ddpme[1], 1);
         }
     }
     else
@@ -7210,28 +7445,28 @@ void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
         comm->npmenodes = 0;
         if (dd->pme_nodeid >= 0)
         {
-            gmx_fatal_collective(FARGS,NULL,dd,
-                                 "Can not have separate PME nodes without PME electrostatics");
+            gmx_fatal_collective(FARGS, NULL, dd,
+                                 "Can not have separate PME ranks without PME electrostatics");
         }
     }
-        
+
     if (debug)
     {
-        fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
+        fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
     }
     if (comm->eDLB != edlbNO)
     {
-        set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
+        set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
     }
-    
-    print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
+
+    print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
     if (comm->eDLB == edlbAUTO)
     {
         if (fplog)
         {
-            fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
+            fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
         }
-        print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
+        print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
     }
 
     if (ir->ePBC == epbcNONE)
@@ -7241,39 +7476,40 @@ void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
     else
     {
         vol_frac =
-            (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
+            (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
     }
     if (debug)
     {
-        fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
+        fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
     }
     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
-   
-    dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
+
+    dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
 }
 
-gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
-                          real cutoff_req)
+static gmx_bool test_dd_cutoff(t_commrec *cr,
+                               t_state *state, t_inputrec *ir,
+                               real cutoff_req)
 {
     gmx_domdec_t *dd;
-    gmx_ddbox_t ddbox;
-    int d,dim,np;
-    real inv_cell_size;
-    int LocallyLimited;
+    gmx_ddbox_t   ddbox;
+    int           d, dim, np;
+    real          inv_cell_size;
+    int           LocallyLimited;
 
     dd = cr->dd;
 
-    set_ddbox(dd,FALSE,cr,ir,state->box,
-              TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
+    set_ddbox(dd, FALSE, cr, ir, state->box,
+              TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
 
     LocallyLimited = 0;
 
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
         dim = dd->dim[d];
 
         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
-        if (dynamic_dd_box(&ddbox,ir))
+        if (dynamic_dd_box(&ddbox, ir))
         {
             inv_cell_size *= DD_PRES_SCALE_MARGIN;
         }
@@ -7301,12 +7537,16 @@ gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
 
     if (dd->comm->eDLB != edlbNO)
     {
-        if (check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
+        /* If DLB is not active yet, we don't need to check the grid jumps.
+         * Actually we shouldn't, because then the grid jump data is not set.
+         */
+        if (dd->comm->bDynLoadBal &&
+            check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
         {
-            LocallyLimited = 1; 
+            LocallyLimited = 1;
         }
 
-        gmx_sumi(1,&LocallyLimited,cr);
+        gmx_sumi(1, &LocallyLimited, cr);
 
         if (LocallyLimited > 0)
         {
@@ -7314,54 +7554,80 @@ gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
         }
     }
 
-    dd->comm->cutoff = cutoff_req;
-
     return TRUE;
 }
 
+gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
+                          real cutoff_req)
+{
+    gmx_bool bCutoffAllowed;
+
+    bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
+
+    if (bCutoffAllowed)
+    {
+        cr->dd->comm->cutoff = cutoff_req;
+    }
+
+    return bCutoffAllowed;
+}
+
+void change_dd_dlb_cutoff_limit(t_commrec *cr)
+{
+    gmx_domdec_comm_t *comm;
+
+    comm = cr->dd->comm;
+
+    /* Turn on the DLB limiting (might have been on already) */
+    comm->bPMELoadBalDLBLimits = TRUE;
+
+    /* Change the cut-off limit */
+    comm->PMELoadBal_max_cutoff = comm->cutoff;
+}
+
 static void merge_cg_buffers(int ncell,
                              gmx_domdec_comm_dim_t *cd, int pulse,
                              int  *ncg_cell,
                              int  *index_gl, int  *recv_i,
                              rvec *cg_cm,    rvec *recv_vr,
                              int *cgindex,
-                             cginfo_mb_t *cginfo_mb,int *cginfo)
+                             cginfo_mb_t *cginfo_mb, int *cginfo)
 {
-    gmx_domdec_ind_t *ind,*ind_p;
-    int p,cell,c,cg,cg0,cg1,cg_gl,nat;
-    int shift,shift_at;
-    
+    gmx_domdec_ind_t *ind, *ind_p;
+    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
+    int               shift, shift_at;
+
     ind = &cd->ind[pulse];
-    
+
     /* First correct the already stored data */
     shift = ind->nrecv[ncell];
-    for(cell=ncell-1; cell>=0; cell--)
+    for (cell = ncell-1; cell >= 0; cell--)
     {
         shift -= ind->nrecv[cell];
         if (shift > 0)
         {
             /* Move the cg's present from previous grid pulses */
-            cg0 = ncg_cell[ncell+cell];
-            cg1 = ncg_cell[ncell+cell+1];
+            cg0                = ncg_cell[ncell+cell];
+            cg1                = ncg_cell[ncell+cell+1];
             cgindex[cg1+shift] = cgindex[cg1];
-            for(cg=cg1-1; cg>=cg0; cg--)
+            for (cg = cg1-1; cg >= cg0; cg--)
             {
                 index_gl[cg+shift] = index_gl[cg];
-                copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
+                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
                 cgindex[cg+shift] = cgindex[cg];
-                cginfo[cg+shift] = cginfo[cg];
+                cginfo[cg+shift]  = cginfo[cg];
             }
             /* Correct the already stored send indices for the shift */
-            for(p=1; p<=pulse; p++)
+            for (p = 1; p <= pulse; p++)
             {
                 ind_p = &cd->ind[p];
-                cg0 = 0;
-                for(c=0; c<cell; c++)
+                cg0   = 0;
+                for (c = 0; c < cell; c++)
                 {
                     cg0 += ind_p->nsend[c];
                 }
                 cg1 = cg0 + ind_p->nsend[cell];
-                for(cg=cg0; cg<cg1; cg++)
+                for (cg = cg0; cg < cg1; cg++)
                 {
                     ind_p->index[cg] += shift;
                 }
@@ -7370,50 +7636,51 @@ static void merge_cg_buffers(int ncell,
     }
 
     /* Merge in the communicated buffers */
-    shift = 0;
+    shift    = 0;
     shift_at = 0;
-    cg0 = 0;
-    for(cell=0; cell<ncell; cell++)
+    cg0      = 0;
+    for (cell = 0; cell < ncell; cell++)
     {
         cg1 = ncg_cell[ncell+cell+1] + shift;
         if (shift_at > 0)
         {
             /* Correct the old cg indices */
-            for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
+            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
             {
                 cgindex[cg+1] += shift_at;
             }
         }
-        for(cg=0; cg<ind->nrecv[cell]; cg++)
+        for (cg = 0; cg < ind->nrecv[cell]; cg++)
         {
             /* Copy this charge group from the buffer */
             index_gl[cg1] = recv_i[cg0];
-            copy_rvec(recv_vr[cg0],cg_cm[cg1]);
+            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
             /* Add it to the cgindex */
-            cg_gl = index_gl[cg1];
-            cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
-            nat = GET_CGINFO_NATOMS(cginfo[cg1]);
+            cg_gl          = index_gl[cg1];
+            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
+            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
             cgindex[cg1+1] = cgindex[cg1] + nat;
             cg0++;
             cg1++;
             shift_at += nat;
         }
-        shift += ind->nrecv[cell];
+        shift                 += ind->nrecv[cell];
         ncg_cell[ncell+cell+1] = cg1;
     }
 }
 
 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
-                               int nzone,int cg0,const int *cgindex)
+                               int nzone, int cg0, const int *cgindex)
 {
-    int cg,zone,p;
-    
+    int cg, zone, p;
+
     /* Store the atom block boundaries for easy copying of communication buffers
      */
     cg = cg0;
-    for(zone=0; zone<nzone; zone++)
+    for (zone = 0; zone < nzone; zone++)
     {
-        for(p=0; p<cd->np; p++) {
+        for (p = 0; p < cd->np; p++)
+        {
             cd->ind[p].cell2at0[zone] = cgindex[cg];
             cg += cd->ind[p].nrecv[zone];
             cd->ind[p].cell2at1[zone] = cgindex[cg];
@@ -7421,13 +7688,13 @@ static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
     }
 }
 
-static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
+static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
 {
-    int  i;
+    int      i;
     gmx_bool bMiss;
 
     bMiss = FALSE;
-    for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
+    for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
     {
         if (!bLocalCG[link->a[i]])
         {
@@ -7454,9 +7721,9 @@ set_dd_corners(const gmx_domdec_t *dd,
                gmx_bool bDistMB,
                dd_corners_t *c)
 {
-    const gmx_domdec_comm_t *comm;
+    const gmx_domdec_comm_t  *comm;
     const gmx_domdec_zones_t *zones;
-    int i,j;
+    int i, j;
 
     comm = dd->comm;
 
@@ -7481,29 +7748,29 @@ set_dd_corners(const gmx_domdec_t *dd,
         c->c[1][1] = comm->cell_x0[dim1];
         if (dd->bGridJump)
         {
-            c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
+            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
             if (bDistMB)
             {
                 /* For the multi-body distance we need the maximum */
-                c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
+                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
             }
         }
         /* Set the upper-right corner for rounding */
         c->cr0 = comm->cell_x1[dim0];
-        
+
         if (dd->ndim >= 3)
         {
             dim2 = dd->dim[2];
-            for(j=0; j<4; j++)
+            for (j = 0; j < 4; j++)
             {
                 c->c[2][j] = comm->cell_x0[dim2];
             }
             if (dd->bGridJump)
             {
                 /* Use the maximum of the i-cells that see a j-cell */
-                for(i=0; i<zones->nizone; i++)
+                for (i = 0; i < zones->nizone; i++)
                 {
-                    for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
+                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
                     {
                         if (j >= 4)
                         {
@@ -7517,16 +7784,16 @@ set_dd_corners(const gmx_domdec_t *dd,
                 {
                     /* For the multi-body distance we need the maximum */
                     c->bc[2] = comm->cell_x0[dim2];
-                    for(i=0; i<2; i++)
+                    for (i = 0; i < 2; i++)
                     {
-                        for(j=0; j<2; j++)
+                        for (j = 0; j < 2; j++)
                         {
-                            c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
+                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
                         }
                     }
                 }
             }
-            
+
             /* Set the upper-right corner for rounding */
             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
              * Only cell (0,0,0) can see cell 7 (1,1,1)
@@ -7535,11 +7802,11 @@ set_dd_corners(const gmx_domdec_t *dd,
             c->cr1[3] = comm->cell_x1[dim1];
             if (dd->bGridJump)
             {
-                c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
+                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
                 if (bDistMB)
                 {
                     /* For the multi-body distance we need the maximum */
-                    c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
+                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
                 }
             }
         }
@@ -7577,13 +7844,13 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
                    int *nsend_z_ptr)
 {
     gmx_domdec_comm_t *comm;
-    gmx_bool bScrew;
-    gmx_bool bDistMB_pulse;
-    int  cg,i;
-    real r2,rb2,r,tric_sh;
-    rvec rn,rb;
-    int  dimd;
-    int  nsend_z,nsend,nat;
+    gmx_bool           bScrew;
+    gmx_bool           bDistMB_pulse;
+    int                cg, i;
+    real               r2, rb2, r, tric_sh;
+    rvec               rn, rb;
+    int                dimd;
+    int                nsend_z, nsend, nat;
 
     comm = dd->comm;
 
@@ -7595,7 +7862,7 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
     nsend   = *nsend_ptr;
     nat     = *nat_ptr;
 
-    for(cg=cg0; cg<cg1; cg++)
+    for (cg = cg0; cg < cg1; cg++)
     {
         r2  = 0;
         rb2 = 0;
@@ -7656,7 +7923,7 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
             {
                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
-                for(i=dim0+1; i<DIM; i++)
+                for (i = dim0+1; i < DIM; i++)
                 {
                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
                 }
@@ -7664,12 +7931,12 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
                 if (bDistMB_pulse)
                 {
                     rb[dim0] = rn[dim0];
-                    rb2 = r2;
+                    rb2      = r2;
                 }
                 /* Take care that the cell planes along dim0 might not
                  * be orthogonal to those along dim1 and dim2.
                  */
-                for(i=1; i<=dim_ind; i++)
+                for (i = 1; i <= dim_ind; i++)
                 {
                     dimd = dd->dim[i];
                     if (normal[dim0][dimd] > 0)
@@ -7685,8 +7952,8 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
             {
                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
-                tric_sh = 0;
-                for(i=dim1+1; i<DIM; i++)
+                tric_sh   = 0;
+                for (i = dim1+1; i < DIM; i++)
                 {
                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
                 }
@@ -7729,8 +7996,8 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
             }
             /* The distance along the communication direction */
             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
-            tric_sh = 0;
-            for(i=dim+1; i<DIM; i++)
+            tric_sh  = 0;
+            for (i = dim+1; i < DIM; i++)
             {
                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
             }
@@ -7763,36 +8030,36 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
                 }
             }
         }
-        
+
         if (r2 < r_comm2 ||
             (bDistBonded &&
              ((bDistMB && rb2 < r_bcomm2) ||
               (bDist2B && r2  < r_bcomm2)) &&
              (!bBondComm ||
               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
-               missing_link(comm->cglink,index_gl[cg],
+               missing_link(comm->cglink, index_gl[cg],
                             comm->bLocalCG)))))
         {
             /* Make an index to the local charge groups */
             if (nsend+1 > ind->nalloc)
             {
                 ind->nalloc = over_alloc_large(nsend+1);
-                srenew(ind->index,ind->nalloc);
+                srenew(ind->index, ind->nalloc);
             }
             if (nsend+1 > *ibuf_nalloc)
             {
                 *ibuf_nalloc = over_alloc_large(nsend+1);
-                srenew(*ibuf,*ibuf_nalloc);
+                srenew(*ibuf, *ibuf_nalloc);
             }
             ind->index[nsend] = cg;
-            (*ibuf)[nsend] = index_gl[cg];
+            (*ibuf)[nsend]    = index_gl[cg];
             nsend_z++;
-            vec_rvec_check_alloc(vbuf,nsend+1);
-            
+            vec_rvec_check_alloc(vbuf, nsend+1);
+
             if (dd->ci[dim] == 0)
             {
                 /* Correct cg_cm for pbc */
-                rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
+                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
                 if (bScrew)
                 {
                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
@@ -7801,7 +8068,7 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
             }
             else
             {
-                copy_rvec(cg_cm[cg],vbuf->v[nsend]);
+                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
             }
             nsend++;
             nat += cgindex[cg+1] - cgindex[cg];
@@ -7814,55 +8081,55 @@ get_zone_pulse_cgs(gmx_domdec_t *dd,
 }
 
 static void setup_dd_communication(gmx_domdec_t *dd,
-                                   matrix box,gmx_ddbox_t *ddbox,
-                                   t_forcerec *fr,t_state *state,rvec **f)
-{
-    int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
-    int nzone,nzone_send,zone,zonei,cg0,cg1;
-    int c,i,j,cg,cg_gl,nrcg;
-    int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
-    gmx_domdec_comm_t *comm;
-    gmx_domdec_zones_t *zones;
+                                   matrix box, gmx_ddbox_t *ddbox,
+                                   t_forcerec *fr, t_state *state, rvec **f)
+{
+    int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
+    int                    nzone, nzone_send, zone, zonei, cg0, cg1;
+    int                    c, i, j, cg, cg_gl, nrcg;
+    int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
+    gmx_domdec_comm_t     *comm;
+    gmx_domdec_zones_t    *zones;
     gmx_domdec_comm_dim_t *cd;
-    gmx_domdec_ind_t *ind;
-    cginfo_mb_t *cginfo_mb;
-    gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
-    real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
-    dd_corners_t corners;
-    ivec tric_dist;
-    rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
-    real skew_fac2_d,skew_fac_01;
-    rvec sf2_round;
-    int  nsend,nat;
-    int  th;
-    
+    gmx_domdec_ind_t      *ind;
+    cginfo_mb_t           *cginfo_mb;
+    gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
+    real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
+    dd_corners_t           corners;
+    ivec                   tric_dist;
+    rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
+    real                   skew_fac2_d, skew_fac_01;
+    rvec                   sf2_round;
+    int                    nsend, nat;
+    int                    th;
+
     if (debug)
     {
-        fprintf(debug,"Setting up DD communication\n");
+        fprintf(debug, "Setting up DD communication\n");
     }
-    
+
     comm  = dd->comm;
 
     switch (fr->cutoff_scheme)
     {
-    case ecutsGROUP:
-        cg_cm = fr->cg_cm;
-        break;
-    case ecutsVERLET:
-        cg_cm = state->x;
-        break;
-    default:
-        gmx_incons("unimplemented");
-        cg_cm = NULL;
+        case ecutsGROUP:
+            cg_cm = fr->cg_cm;
+            break;
+        case ecutsVERLET:
+            cg_cm = state->x;
+            break;
+        default:
+            gmx_incons("unimplemented");
+            cg_cm = NULL;
     }
 
-    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
+    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
     {
         dim = dd->dim[dim_ind];
 
         /* Check if we need to use triclinic distances */
         tric_dist[dim_ind] = 0;
-        for(i=0; i<=dim_ind; i++)
+        for (i = 0; i <= dim_ind; i++)
         {
             if (ddbox->tric_dir[dd->dim[i]])
             {
@@ -7875,7 +8142,7 @@ static void setup_dd_communication(gmx_domdec_t *dd,
 
     /* Do we need to determine extra distances for multi-body bondeds? */
     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
-    
+
     /* Do we need to determine extra distances for only two-body bondeds? */
     bDist2B = (bBondComm && !bDistMB);
 
@@ -7884,19 +8151,19 @@ static void setup_dd_communication(gmx_domdec_t *dd,
 
     if (debug)
     {
-        fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
+        fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
     }
 
     zones = &comm->zones;
-    
+
     dim0 = dd->dim[0];
     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
 
-    set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
-    
+    set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
+
     /* Triclinic stuff */
-    normal = ddbox->normal;
+    normal      = ddbox->normal;
     skew_fac_01 = 0;
     if (dd->ndim >= 2)
     {
@@ -7911,7 +8178,7 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
             if (debug)
             {
-                fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
+                fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
             }
         }
     }
@@ -7919,24 +8186,24 @@ static void setup_dd_communication(gmx_domdec_t *dd,
     {
         v_1 = ddbox->v[dim1];
     }
-    
+
     zone_cg_range = zones->cg_range;
-    index_gl = dd->index_gl;
-    cgindex  = dd->cgindex;
-    cginfo_mb = fr->cginfo_mb;
-    
+    index_gl      = dd->index_gl;
+    cgindex       = dd->cgindex;
+    cginfo_mb     = fr->cginfo_mb;
+
     zone_cg_range[0]   = 0;
     zone_cg_range[1]   = dd->ncg_home;
     comm->zone_ncg1[0] = dd->ncg_home;
     pos_cg             = dd->ncg_home;
-    
+
     nat_tot = dd->nat_home;
-    nzone = 1;
-    for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
+    nzone   = 1;
+    for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
     {
         dim = dd->dim[dim_ind];
-        cd = &comm->cd[dim_ind];
-        
+        cd  = &comm->cd[dim_ind];
+
         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
         {
             /* No pbc in this dimension, the first node should not comm. */
@@ -7947,21 +8214,21 @@ static void setup_dd_communication(gmx_domdec_t *dd,
             nzone_send = nzone;
         }
 
-        v_d = ddbox->v[dim];
+        v_d         = ddbox->v[dim];
         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
 
         cd->bInPlace = TRUE;
-        for(p=0; p<cd->np; p++)
+        for (p = 0; p < cd->np; p++)
         {
             /* Only atoms communicated in the first pulse are used
              * for multi-body bonded interactions or for bBondComm.
              */
             bDistBonded = ((bDistMB || bDist2B) && p == 0);
 
-            ind = &cd->ind[p];
+            ind   = &cd->ind[p];
             nsend = 0;
-            nat = 0;
-            for(zone=0; zone<nzone_send; zone++)
+            nat   = 0;
+            for (zone = 0; zone < nzone_send; zone++)
             {
                 if (tric_dist[dim_ind] && dim_ind > 0)
                 {
@@ -7970,12 +8237,12 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                      * This reduces the number of communicated atoms
                      * by about 10% for 3D DD of rhombic dodecahedra.
                      */
-                    for(dimd=0; dimd<dim; dimd++)
+                    for (dimd = 0; dimd < dim; dimd++)
                     {
                         sf2_round[dimd] = 1;
                         if (ddbox->tric_dir[dimd])
                         {
-                            for(i=dd->dim[dimd]+1; i<DIM; i++)
+                            for (i = dd->dim[dimd]+1; i < DIM; i++)
                             {
                                 /* If we are shifted in dimension i
                                  * and the cell plane is tilted forward
@@ -8011,14 +8278,14 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                 }
 
 #pragma omp parallel for num_threads(comm->nth) schedule(static)
-                for(th=0; th<comm->nth; th++)
+                for (th = 0; th < comm->nth; th++)
                 {
                     gmx_domdec_ind_t *ind_p;
-                    int **ibuf_p,*ibuf_nalloc_p;
-                    vec_rvec_t *vbuf_p;
-                    int *nsend_p,*nat_p;
-                    int *nsend_zone_p;
-                    int cg0_th,cg1_th;
+                    int             **ibuf_p, *ibuf_nalloc_p;
+                    vec_rvec_t       *vbuf_p;
+                    int              *nsend_p, *nat_p;
+                    int              *nsend_zone_p;
+                    int               cg0_th, cg1_th;
 
                     if (th == 0)
                     {
@@ -8057,30 +8324,30 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
                     }
-                    
+
                     /* Get the cg's for this pulse in this zone */
-                    get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
-                                       index_gl,cgindex,
-                                       dim,dim_ind,dim0,dim1,dim2,
-                                       r_comm2,r_bcomm2,
-                                       box,tric_dist,
-                                       normal,skew_fac2_d,skew_fac_01,
-                                       v_d,v_0,v_1,&corners,sf2_round,
-                                       bDistBonded,bBondComm,
-                                       bDist2B,bDistMB,
-                                       cg_cm,fr->cginfo,
+                    get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
+                                       index_gl, cgindex,
+                                       dim, dim_ind, dim0, dim1, dim2,
+                                       r_comm2, r_bcomm2,
+                                       box, tric_dist,
+                                       normal, skew_fac2_d, skew_fac_01,
+                                       v_d, v_0, v_1, &corners, sf2_round,
+                                       bDistBonded, bBondComm,
+                                       bDist2B, bDistMB,
+                                       cg_cm, fr->cginfo,
                                        ind_p,
-                                       ibuf_p,ibuf_nalloc_p,
+                                       ibuf_p, ibuf_nalloc_p,
                                        vbuf_p,
-                                       nsend_p,nat_p,
+                                       nsend_p, nat_p,
                                        nsend_zone_p);
                 }
 
                 /* Append data of threads>=1 to the communication buffers */
-                for(th=1; th<comm->nth; th++)
+                for (th = 1; th < comm->nth; th++)
                 {
                     dd_comm_setup_work_t *dth;
-                    int i,ns1;
+                    int                   i, ns1;
 
                     dth = &comm->dth[th];
 
@@ -8088,22 +8355,22 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                     if (ns1 > ind->nalloc)
                     {
                         ind->nalloc = over_alloc_dd(ns1);
-                        srenew(ind->index,ind->nalloc);
+                        srenew(ind->index, ind->nalloc);
                     }
                     if (ns1 > comm->nalloc_int)
                     {
                         comm->nalloc_int = over_alloc_dd(ns1);
-                        srenew(comm->buf_int,comm->nalloc_int);
+                        srenew(comm->buf_int, comm->nalloc_int);
                     }
                     if (ns1 > comm->vbuf.nalloc)
                     {
                         comm->vbuf.nalloc = over_alloc_dd(ns1);
-                        srenew(comm->vbuf.v,comm->vbuf.nalloc);
+                        srenew(comm->vbuf.v, comm->vbuf.nalloc);
                     }
 
-                    for(i=0; i<dth->nsend_zone; i++)
+                    for (i = 0; i < dth->nsend_zone; i++)
                     {
-                        ind->index[nsend] = dth->ind.index[i];
+                        ind->index[nsend]    = dth->ind.index[i];
                         comm->buf_int[nsend] = dth->ibuf[i];
                         copy_rvec(dth->vbuf.v[i],
                                   comm->vbuf.v[nsend]);
@@ -8114,7 +8381,7 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                 }
             }
             /* Clear the counts in case we do not have pbc */
-            for(zone=nzone_send; zone<nzone; zone++)
+            for (zone = nzone_send; zone < nzone; zone++)
             {
                 ind->nsend[zone] = 0;
             }
@@ -8124,16 +8391,16 @@ static void setup_dd_communication(gmx_domdec_t *dd,
             dd_sendrecv_int(dd, dim_ind, dddirBackward,
                             ind->nsend, nzone+2,
                             ind->nrecv, nzone+2);
-            
+
             /* The rvec buffer is also required for atom buffers of size nsend
              * in dd_move_x and dd_move_f.
              */
-            vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
+            vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
 
             if (p > 0)
             {
                 /* We can receive in place if only the last zone is not empty */
-                for(zone=0; zone<nzone-1; zone++)
+                for (zone = 0; zone < nzone-1; zone++)
                 {
                     if (ind->nrecv[zone] > 0)
                     {
@@ -8146,23 +8413,23 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                     if (ind->nrecv[nzone] > comm->nalloc_int2)
                     {
                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
-                        srenew(comm->buf_int2,comm->nalloc_int2);
+                        srenew(comm->buf_int2, comm->nalloc_int2);
                     }
                     /* The rvec buffer is also required for atom buffers
                      * of size nrecv in dd_move_x and dd_move_f.
                      */
-                    i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
-                    vec_rvec_check_alloc(&comm->vbuf2,i);
+                    i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
+                    vec_rvec_check_alloc(&comm->vbuf2, i);
                 }
             }
-            
+
             /* Make space for the global cg indices */
             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
                 || dd->cg_nalloc == 0)
             {
                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
-                srenew(index_gl,dd->cg_nalloc);
-                srenew(cgindex,dd->cg_nalloc+1);
+                srenew(index_gl, dd->cg_nalloc);
+                srenew(cgindex, dd->cg_nalloc+1);
             }
             /* Communicate the global cg indices */
             if (cd->bInPlace)
@@ -8178,7 +8445,7 @@ static void setup_dd_communication(gmx_domdec_t *dd,
                             recv_i,        ind->nrecv[nzone]);
 
             /* Make space for cg_cm */
-            dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
+            dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
             if (fr->cutoff_scheme == ecutsGROUP)
             {
                 cg_cm = fr->cg_cm;
@@ -8199,19 +8466,19 @@ static void setup_dd_communication(gmx_domdec_t *dd,
             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
                              comm->vbuf.v, nsend,
                              recv_vr,      ind->nrecv[nzone]);
-            
+
             /* Make the charge group index */
             if (cd->bInPlace)
             {
                 zone = (p == 0 ? 0 : nzone - 1);
                 while (zone < nzone)
                 {
-                    for(cg=0; cg<ind->nrecv[zone]; cg++)
+                    for (cg = 0; cg < ind->nrecv[zone]; cg++)
                     {
-                        cg_gl = index_gl[pos_cg];
-                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
-                        nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
-                        cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
+                        cg_gl              = index_gl[pos_cg];
+                        fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
+                        nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
+                        cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
                         if (bBondComm)
                         {
                             /* Update the charge group presence,
@@ -8232,9 +8499,9 @@ static void setup_dd_communication(gmx_domdec_t *dd,
             else
             {
                 /* This part of the code is never executed with bBondComm. */
-                merge_cg_buffers(nzone,cd,p,zone_cg_range,
-                                 index_gl,recv_i,cg_cm,recv_vr,
-                                 cgindex,fr->cginfo_mb,fr->cginfo);
+                merge_cg_buffers(nzone, cd, p, zone_cg_range,
+                                 index_gl, recv_i, cg_cm, recv_vr,
+                                 cgindex, fr->cginfo_mb, fr->cginfo);
                 pos_cg += ind->nrecv[nzone];
             }
             nat_tot += ind->nrecv[nzone+1];
@@ -8242,17 +8509,17 @@ static void setup_dd_communication(gmx_domdec_t *dd,
         if (!cd->bInPlace)
         {
             /* Store the atom block for easy copying of communication buffers */
-            make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
+            make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
         }
         nzone += nzone;
     }
     dd->index_gl = index_gl;
     dd->cgindex  = cgindex;
-    
-    dd->ncg_tot = zone_cg_range[zones->n];
-    dd->nat_tot = nat_tot;
+
+    dd->ncg_tot          = zone_cg_range[zones->n];
+    dd->nat_tot          = nat_tot;
     comm->nat[ddnatHOME] = dd->nat_home;
-    for(i=ddnatZONE; i<ddnatNR; i++)
+    for (i = ddnatZONE; i < ddnatNR; i++)
     {
         comm->nat[i] = dd->nat_tot;
     }
@@ -8262,26 +8529,26 @@ static void setup_dd_communication(gmx_domdec_t *dd,
         /* We don't need to update cginfo, since that was already done above.
          * So we pass NULL for the forcerec.
          */
-        dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
-                      NULL,comm->bLocalCG);
+        dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
+                      NULL, comm->bLocalCG);
     }
 
     if (debug)
     {
-        fprintf(debug,"Finished setting up DD communication, zones:");
-        for(c=0; c<zones->n; c++)
+        fprintf(debug, "Finished setting up DD communication, zones:");
+        for (c = 0; c < zones->n; c++)
         {
-            fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
+            fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
         }
-        fprintf(debug,"\n");
+        fprintf(debug, "\n");
     }
 }
 
 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 {
     int c;
-    
-    for(c=0; c<zones->nizone; c++)
+
+    for (c = 0; c < zones->nizone; c++)
     {
         zones->izone[c].cg1  = zones->cg_range[c+1];
         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
@@ -8290,17 +8557,17 @@ static void set_cg_boundaries(gmx_domdec_zones_t *zones)
 }
 
 static void set_zones_size(gmx_domdec_t *dd,
-                           matrix box,const gmx_ddbox_t *ddbox,
-                           int zone_start,int zone_end)
+                           matrix box, const gmx_ddbox_t *ddbox,
+                           int zone_start, int zone_end)
 {
-    gmx_domdec_comm_t *comm;
+    gmx_domdec_comm_t  *comm;
     gmx_domdec_zones_t *zones;
-    gmx_bool bDistMB;
-    int  z,zi,zj0,zj1,d,dim;
-    real rcs,rcmbs;
-    int  i,j;
-    real size_j,add_tric;
-    real vol;
+    gmx_bool            bDistMB;
+    int                 z, zi, zj0, zj1, d, dim;
+    real                rcs, rcmbs;
+    int                 i, j;
+    real                size_j, add_tric;
+    real                vol;
 
     comm = dd->comm;
 
@@ -8309,20 +8576,20 @@ static void set_zones_size(gmx_domdec_t *dd,
     /* Do we need to determine extra distances for multi-body bondeds? */
     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
 
-    for(z=zone_start; z<zone_end; z++)
+    for (z = zone_start; z < zone_end; z++)
     {
         /* Copy cell limits to zone limits.
          * Valid for non-DD dims and non-shifted dims.
          */
-        copy_rvec(comm->cell_x0,zones->size[z].x0);
-        copy_rvec(comm->cell_x1,zones->size[z].x1);
+        copy_rvec(comm->cell_x0, zones->size[z].x0);
+        copy_rvec(comm->cell_x1, zones->size[z].x1);
     }
 
-    for(d=0; d<dd->ndim; d++)
+    for (d = 0; d < dd->ndim; d++)
     {
         dim = dd->dim[d];
 
-        for(z=0; z<zones->n; z++)
+        for (z = 0; z < zones->n; z++)
         {
             /* With a staggered grid we have different sizes
              * for non-shifted dimensions.
@@ -8351,7 +8618,7 @@ static void set_zones_size(gmx_domdec_t *dd,
         }
 
         /* Set the lower limit for the shifted zone dimensions */
-        for(z=zone_start; z<zone_end; z++)
+        for (z = zone_start; z < zone_end; z++)
         {
             if (zones->shift[z][dim] > 0)
             {
@@ -8369,7 +8636,7 @@ static void set_zones_size(gmx_domdec_t *dd,
                     if (z < 4)
                     {
                         zones->size[z].x0[dim] =
-                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
+                            comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
                     }
                     else
                     {
@@ -8389,7 +8656,7 @@ static void set_zones_size(gmx_domdec_t *dd,
 
                     if (bDistMB)
                     {
-                        for(zi=0; zi<zones->nizone; zi++)
+                        for (zi = 0; zi < zones->nizone; zi++)
                         {
                             if (zones->shift[zi][dim] == 0)
                             {
@@ -8409,11 +8676,11 @@ static void set_zones_size(gmx_domdec_t *dd,
         /* Loop over the i-zones to set the upper limit of each
          * j-zone they see.
          */
-        for(zi=0; zi<zones->nizone; zi++)
+        for (zi = 0; zi < zones->nizone; zi++)
         {
             if (zones->shift[zi][dim] == 0)
             {
-                for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
+                for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
                 {
                     if (zones->shift[z][dim] > 0)
                     {
@@ -8425,47 +8692,83 @@ static void set_zones_size(gmx_domdec_t *dd,
         }
     }
 
-    for(z=zone_start; z<zone_end; z++)
+    for (z = zone_start; z < zone_end; z++)
     {
-        for(i=0; i<DIM; i++)
-        {
-            zones->size[z].bb_x0[i] = zones->size[z].x0[i];
-            zones->size[z].bb_x1[i] = zones->size[z].x1[i];
+        /* Initialization only required to keep the compiler happy */
+        rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
+        int  nc, c;
+
+        /* To determine the bounding box for a zone we need to find
+         * the extreme coordinates over its 4, 2 or 1 corners.
+         */
+        nc = 1 << (ddbox->npbcdim - 1);
 
-            for(j=i+1; j<ddbox->npbcdim; j++)
+        for (c = 0; c < nc; c++)
+        {
+            /* Set up a zone corner at x=0, ignoring triclinic couplings */
+            corner[XX] = 0;
+            if ((c & 1) == 0)
+            {
+                corner[YY] = zones->size[z].x0[YY];
+            }
+            else
+            {
+                corner[YY] = zones->size[z].x1[YY];
+            }
+            if ((c & 2) == 0)
+            {
+                corner[ZZ] = zones->size[z].x0[ZZ];
+            }
+            else
+            {
+                corner[ZZ] = zones->size[z].x1[ZZ];
+            }
+            if (dd->ndim == 1 && box[ZZ][YY] != 0)
             {
                 /* With 1D domain decomposition the cg's are not in
-                 * the triclinic box, but trilinic x-y and rectangular y-z.
+                 * the triclinic box, but triclinic x-y and rectangular y-z.
+                 * Shift y back, so it will later end up at 0.
                  */
-                if (box[j][i] != 0 &&
-                    !(dd->ndim == 1 && i == YY && j == ZZ))
+                corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
+            }
+            /* Apply the triclinic couplings */
+            assert(ddbox->npbcdim <= DIM);
+            for (i = YY; i < ddbox->npbcdim; i++)
+            {
+                for (j = XX; j < i; j++)
                 {
-                    /* Correct for triclinic offset of the lower corner */
-                    add_tric = zones->size[z].x0[j]*box[j][i]/box[j][j];
-                    zones->size[z].bb_x0[i] += add_tric;
-                    zones->size[z].bb_x1[i] += add_tric;
-
-                    /* Correct for triclinic offset of the upper corner */
-                    size_j = zones->size[z].x1[j] - zones->size[z].x0[j];
-                    add_tric = size_j*box[j][i]/box[j][j];
-
-                    if (box[j][i] < 0)
-                    {
-                        zones->size[z].bb_x0[i] += add_tric;
-                    }
-                    else
-                    {
-                        zones->size[z].bb_x1[i] += add_tric;
-                    }
+                    corner[j] += corner[i]*box[i][j]/box[i][i];
+                }
+            }
+            if (c == 0)
+            {
+                copy_rvec(corner, corner_min);
+                copy_rvec(corner, corner_max);
+            }
+            else
+            {
+                for (i = 0; i < DIM; i++)
+                {
+                    corner_min[i] = min(corner_min[i], corner[i]);
+                    corner_max[i] = max(corner_max[i], corner[i]);
                 }
             }
         }
+        /* Copy the extreme corners without offset along x */
+        for (i = 0; i < DIM; i++)
+        {
+            zones->size[z].bb_x0[i] = corner_min[i];
+            zones->size[z].bb_x1[i] = corner_max[i];
+        }
+        /* Add the offset along x */
+        zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
+        zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
     }
 
     if (zone_start == 0)
     {
         vol = 1;
-        for(dim=0; dim<DIM; dim++)
+        for (dim = 0; dim < DIM; dim++)
         {
             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
         }
@@ -8474,123 +8777,123 @@ static void set_zones_size(gmx_domdec_t *dd,
 
     if (debug)
     {
-        for(z=zone_start; z<zone_end; z++)
+        for (z = zone_start; z < zone_end; z++)
         {
-            fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
+            fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
                     z,
-                    zones->size[z].x0[XX],zones->size[z].x1[XX],
-                    zones->size[z].x0[YY],zones->size[z].x1[YY],
-                    zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
-            fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
+                    zones->size[z].x0[XX], zones->size[z].x1[XX],
+                    zones->size[z].x0[YY], zones->size[z].x1[YY],
+                    zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
+            fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
                     z,
-                    zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
-                    zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
-                    zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
+                    zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
+                    zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
+                    zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
         }
     }
 }
 
-static int comp_cgsort(const void *a,const void *b)
+static int comp_cgsort(const void *a, const void *b)
 {
-    int comp;
-    
-    gmx_cgsort_t *cga,*cgb;
+    int           comp;
+
+    gmx_cgsort_t *cga, *cgb;
     cga = (gmx_cgsort_t *)a;
     cgb = (gmx_cgsort_t *)b;
-    
+
     comp = cga->nsc - cgb->nsc;
     if (comp == 0)
     {
         comp = cga->ind_gl - cgb->ind_gl;
     }
-    
+
     return comp;
 }
 
-static void order_int_cg(int n,const gmx_cgsort_t *sort,
-                         int *a,int *buf)
+static void order_int_cg(int n, const gmx_cgsort_t *sort,
+                         int *a, int *buf)
 {
     int i;
-    
+
     /* Order the data */
-    for(i=0; i<n; i++)
+    for (i = 0; i < n; i++)
     {
         buf[i] = a[sort[i].ind];
     }
-    
+
     /* Copy back to the original array */
-    for(i=0; i<n; i++)
+    for (i = 0; i < n; i++)
     {
         a[i] = buf[i];
     }
 }
 
-static void order_vec_cg(int n,const gmx_cgsort_t *sort,
-                         rvec *v,rvec *buf)
+static void order_vec_cg(int n, const gmx_cgsort_t *sort,
+                         rvec *v, rvec *buf)
 {
     int i;
-    
+
     /* Order the data */
-    for(i=0; i<n; i++)
+    for (i = 0; i < n; i++)
     {
-        copy_rvec(v[sort[i].ind],buf[i]);
+        copy_rvec(v[sort[i].ind], buf[i]);
     }
-    
+
     /* Copy back to the original array */
-    for(i=0; i<n; i++)
+    for (i = 0; i < n; i++)
     {
-        copy_rvec(buf[i],v[i]);
+        copy_rvec(buf[i], v[i]);
     }
 }
 
-static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
-                           rvec *v,rvec *buf)
+static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
+                           rvec *v, rvec *buf)
 {
-    int a,atot,cg,cg0,cg1,i;
-    
+    int a, atot, cg, cg0, cg1, i;
+
     if (cgindex == NULL)
     {
         /* Avoid the useless loop of the atoms within a cg */
-        order_vec_cg(ncg,sort,v,buf);
+        order_vec_cg(ncg, sort, v, buf);
 
         return;
     }
 
     /* Order the data */
     a = 0;
-    for(cg=0; cg<ncg; cg++)
+    for (cg = 0; cg < ncg; cg++)
     {
         cg0 = cgindex[sort[cg].ind];
         cg1 = cgindex[sort[cg].ind+1];
-        for(i=cg0; i<cg1; i++)
+        for (i = cg0; i < cg1; i++)
         {
-            copy_rvec(v[i],buf[a]);
+            copy_rvec(v[i], buf[a]);
             a++;
         }
     }
     atot = a;
-    
+
     /* Copy back to the original array */
-    for(a=0; a<atot; a++)
+    for (a = 0; a < atot; a++)
     {
-        copy_rvec(buf[a],v[a]);
+        copy_rvec(buf[a], v[a]);
     }
 }
 
-static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
-                         int nsort_new,gmx_cgsort_t *sort_new,
+static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
+                         int nsort_new, gmx_cgsort_t *sort_new,
                          gmx_cgsort_t *sort1)
 {
-    int i1,i2,i_new;
-    
+    int i1, i2, i_new;
+
     /* The new indices are not very ordered, so we qsort them */
-    qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
-    
+    gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
+
     /* sort2 is already ordered, so now we can merge the two arrays */
-    i1 = 0;
-    i2 = 0;
+    i1    = 0;
+    i2    = 0;
     i_new = 0;
-    while(i2 < nsort2 || i_new < nsort_new)
+    while (i2 < nsort2 || i_new < nsort_new)
     {
         if (i2 == nsort2)
         {
@@ -8613,12 +8916,12 @@ static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
     }
 }
 
-static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
+static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
 {
     gmx_domdec_sort_t *sort;
-    gmx_cgsort_t *cgsort,*sort_i;
-    int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
-    int  sort_last,sort_skip;
+    gmx_cgsort_t      *cgsort, *sort_i;
+    int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
+    int                sort_last, sort_skip;
 
     sort = dd->comm->sort;
 
@@ -8632,10 +8935,10 @@ static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
          * are completely ordered. So we can sort efficiently by sorting
          * the charge groups that did move into the stationary list.
          */
-        ncg_new = 0;
-        nsort2 = 0;
+        ncg_new   = 0;
+        nsort2    = 0;
         nsort_new = 0;
-        for(i=0; i<dd->ncg_home; i++)
+        for (i = 0; i < dd->ncg_home; i++)
         {
             /* Check if this cg did not move to another node */
             if (a[i] < moved)
@@ -8646,7 +8949,7 @@ static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
                     if (nsort_new >= sort->sort_new_nalloc)
                     {
                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
-                        srenew(sort->sort_new,sort->sort_new_nalloc);
+                        srenew(sort->sort_new, sort->sort_new_nalloc);
                     }
                     sort_i = &(sort->sort_new[nsort_new++]);
                 }
@@ -8668,18 +8971,18 @@ static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
         }
         if (debug)
         {
-            fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
-                    nsort2,nsort_new);
+            fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
+                    nsort2, nsort_new);
         }
         /* Sort efficiently */
-        ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
+        ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
                      sort->sort);
     }
     else
     {
-        cgsort = sort->sort;
+        cgsort  = sort->sort;
         ncg_new = 0;
-        for(i=0; i<dd->ncg_home; i++)
+        for (i = 0; i < dd->ncg_home; i++)
         {
             /* Sort on the ns grid cell indices
              * and the global topology index
@@ -8694,26 +8997,26 @@ static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
         }
         if (debug)
         {
-            fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
+            fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
         }
         /* Determine the order of the charge groups using qsort */
-        qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
+        gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
     }
 
     return ncg_new;
 }
 
-static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
+static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
 {
     gmx_cgsort_t *sort;
-    int  ncg_new,i,*a,na;
+    int           ncg_new, i, *a, na;
 
     sort = dd->comm->sort->sort;
 
-    nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
+    nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
 
     ncg_new = 0;
-    for(i=0; i<na; i++)
+    for (i = 0; i < na; i++)
     {
         if (a[i] >= 0)
         {
@@ -8725,43 +9028,42 @@ static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
     return ncg_new;
 }
 
-static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
-                          rvec *cgcm,t_forcerec *fr,t_state *state,
+static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
                           int ncg_home_old)
 {
     gmx_domdec_sort_t *sort;
-    gmx_cgsort_t *cgsort,*sort_i;
-    int  *cgindex;
-    int  ncg_new,i,*ibuf,cgsize;
-    rvec *vbuf;
-    
+    gmx_cgsort_t      *cgsort, *sort_i;
+    int               *cgindex;
+    int                ncg_new, i, *ibuf, cgsize;
+    rvec              *vbuf;
+
     sort = dd->comm->sort;
-    
+
     if (dd->ncg_home > sort->sort_nalloc)
     {
         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
-        srenew(sort->sort,sort->sort_nalloc);
-        srenew(sort->sort2,sort->sort_nalloc);
+        srenew(sort->sort, sort->sort_nalloc);
+        srenew(sort->sort2, sort->sort_nalloc);
     }
     cgsort = sort->sort;
 
     switch (fr->cutoff_scheme)
     {
-    case ecutsGROUP:
-        ncg_new = dd_sort_order(dd,fr,ncg_home_old);
-        break;
-    case ecutsVERLET:
-        ncg_new = dd_sort_order_nbnxn(dd,fr);
-        break;
-    default:
-        gmx_incons("unimplemented");
-        ncg_new = 0;
+        case ecutsGROUP:
+            ncg_new = dd_sort_order(dd, fr, ncg_home_old);
+            break;
+        case ecutsVERLET:
+            ncg_new = dd_sort_order_nbnxn(dd, fr);
+            break;
+        default:
+            gmx_incons("unimplemented");
+            ncg_new = 0;
     }
 
     /* We alloc with the old size, since cgindex is still old */
-    vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
+    vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
     vbuf = dd->comm->vbuf.v;
-    
+
     if (dd->comm->bCGs)
     {
         cgindex = dd->cgindex;
@@ -8775,76 +9077,76 @@ static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
     dd->ncg_home = ncg_new;
     if (debug)
     {
-        fprintf(debug,"Set the new home charge group count to %d\n",
+        fprintf(debug, "Set the new home charge group count to %d\n",
                 dd->ncg_home);
     }
-    
+
     /* Reorder the state */
-    for(i=0; i<estNR; i++)
+    for (i = 0; i < estNR; i++)
     {
         if (EST_DISTR(i) && (state->flags & (1<<i)))
         {
             switch (i)
             {
-            case estX:
-                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
-                break;
-            case estV:
-                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
-                break;
-            case estSDX:
-                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
-                break;
-            case estCGP:
-                order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
-                break;
-            case estLD_RNG:
-            case estLD_RNGI:
-            case estDISRE_INITF:
-            case estDISRE_RM3TAV:
-            case estORIRE_INITF:
-            case estORIRE_DTAV:
-                /* No ordering required */
-                break;
-            default:
-                gmx_incons("Unknown state entry encountered in dd_sort_state");
-                break;
+                case estX:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
+                    break;
+                case estV:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
+                    break;
+                case estSDX:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
+                    break;
+                case estCGP:
+                    order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
+                    break;
+                case estLD_RNG:
+                case estLD_RNGI:
+                case estDISRE_INITF:
+                case estDISRE_RM3TAV:
+                case estORIRE_INITF:
+                case estORIRE_DTAV:
+                    /* No ordering required */
+                    break;
+                default:
+                    gmx_incons("Unknown state entry encountered in dd_sort_state");
+                    break;
             }
         }
     }
     if (fr->cutoff_scheme == ecutsGROUP)
     {
         /* Reorder cgcm */
-        order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
+        order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
     }
-    
+
     if (dd->ncg_home+1 > sort->ibuf_nalloc)
     {
         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
-        srenew(sort->ibuf,sort->ibuf_nalloc);
+        srenew(sort->ibuf, sort->ibuf_nalloc);
     }
     ibuf = sort->ibuf;
     /* Reorder the global cg index */
-    order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
+    order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
     /* Reorder the cginfo */
-    order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
+    order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
     /* Rebuild the local cg index */
     if (dd->comm->bCGs)
     {
         ibuf[0] = 0;
-        for(i=0; i<dd->ncg_home; i++)
+        for (i = 0; i < dd->ncg_home; i++)
         {
-            cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
+            cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
             ibuf[i+1] = ibuf[i] + cgsize;
         }
-        for(i=0; i<dd->ncg_home+1; i++)
+        for (i = 0; i < dd->ncg_home+1; i++)
         {
             dd->cgindex[i] = ibuf[i];
         }
     }
     else
     {
-        for(i=0; i<dd->ncg_home+1; i++)
+        for (i = 0; i < dd->ncg_home+1; i++)
         {
             dd->cgindex[i] = i;
         }
@@ -8860,7 +9162,7 @@ static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
     else
     {
         /* Copy the sorted ns cell indices back to the ns grid struct */
-        for(i=0; i<dd->ncg_home; i++)
+        for (i = 0; i < dd->ncg_home; i++)
         {
             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
         }
@@ -8871,11 +9173,11 @@ static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
 static void add_dd_statistics(gmx_domdec_t *dd)
 {
     gmx_domdec_comm_t *comm;
-    int ddnat;
-    
+    int                ddnat;
+
     comm = dd->comm;
-    
-    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
+
+    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
     {
         comm->sum_nat[ddnat-ddnatZONE] +=
             comm->nat[ddnat] - comm->nat[ddnat-1];
@@ -8886,115 +9188,115 @@ static void add_dd_statistics(gmx_domdec_t *dd)
 void reset_dd_statistics_counters(gmx_domdec_t *dd)
 {
     gmx_domdec_comm_t *comm;
-    int ddnat;
-    
+    int                ddnat;
+
     comm = dd->comm;
 
     /* Reset all the statistics and counters for total run counting */
-    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
+    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
     {
         comm->sum_nat[ddnat-ddnatZONE] = 0;
     }
-    comm->ndecomp = 0;
-    comm->nload = 0;
+    comm->ndecomp   = 0;
+    comm->nload     = 0;
     comm->load_step = 0;
-    comm->load_sum = 0;
-    comm->load_max = 0;
+    comm->load_sum  = 0;
+    comm->load_max  = 0;
     clear_ivec(comm->load_lim);
     comm->load_mdf = 0;
     comm->load_pme = 0;
 }
 
-void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
+void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
 {
     gmx_domdec_comm_t *comm;
-    int ddnat;
-    double av;
-   
+    int                ddnat;
+    double             av;
+
     comm = cr->dd->comm;
-    
-    gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
-    
+
+    gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
+
     if (fplog == NULL)
     {
         return;
     }
-    
-    fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
-            
-    for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
+
+    fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
+
+    for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
     {
         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
-        switch(ddnat)
+        switch (ddnat)
         {
-        case ddnatZONE:
-            fprintf(fplog,
-                    " av. #atoms communicated per step for force:  %d x %.1f\n",
-                    2,av);
-            break;
-        case ddnatVSITE:
-            if (cr->dd->vsite_comm)
-            {
-                fprintf(fplog,
-                        " av. #atoms communicated per step for vsites: %d x %.1f\n",
-                        (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
-                        av);
-            }
-            break;
-        case ddnatCON:
-            if (cr->dd->constraint_comm)
-            {
+            case ddnatZONE:
                 fprintf(fplog,
-                        " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
-                        1 + ir->nLincsIter,av);
-            }
-            break;
-        default:
-            gmx_incons(" Unknown type for DD statistics");
+                        " av. #atoms communicated per step for force:  %d x %.1f\n",
+                        2, av);
+                break;
+            case ddnatVSITE:
+                if (cr->dd->vsite_comm)
+                {
+                    fprintf(fplog,
+                            " av. #atoms communicated per step for vsites: %d x %.1f\n",
+                            (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
+                            av);
+                }
+                break;
+            case ddnatCON:
+                if (cr->dd->constraint_comm)
+                {
+                    fprintf(fplog,
+                            " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
+                            1 + ir->nLincsIter, av);
+                }
+                break;
+            default:
+                gmx_incons(" Unknown type for DD statistics");
         }
     }
-    fprintf(fplog,"\n");
-    
+    fprintf(fplog, "\n");
+
     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
     {
-        print_dd_load_av(fplog,cr->dd);
-    }
-}
-
-void dd_partition_system(FILE            *fplog,
-                         gmx_large_int_t      step,
-                         t_commrec       *cr,
-                         gmx_bool            bMasterState,
-                         int             nstglobalcomm,
-                         t_state         *state_global,
-                         gmx_mtop_t      *top_global,
-                         t_inputrec      *ir,
-                         t_state         *state_local,
-                         rvec            **f,
-                         t_mdatoms       *mdatoms,
-                         gmx_localtop_t  *top_local,
-                         t_forcerec      *fr,
-                         gmx_vsite_t     *vsite,
-                         gmx_shellfc_t   shellfc,
-                         gmx_constr_t    constr,
-                         t_nrnb          *nrnb,
-                         gmx_wallcycle_t wcycle,
-                         gmx_bool            bVerbose)
-{
-    gmx_domdec_t *dd;
+        print_dd_load_av(fplog, cr->dd);
+    }
+}
+
+void dd_partition_system(FILE                *fplog,
+                         gmx_int64_t          step,
+                         t_commrec           *cr,
+                         gmx_bool             bMasterState,
+                         int                  nstglobalcomm,
+                         t_state             *state_global,
+                         gmx_mtop_t          *top_global,
+                         t_inputrec          *ir,
+                         t_state             *state_local,
+                         rvec               **f,
+                         t_mdatoms           *mdatoms,
+                         gmx_localtop_t      *top_local,
+                         t_forcerec          *fr,
+                         gmx_vsite_t         *vsite,
+                         gmx_shellfc_t        shellfc,
+                         gmx_constr_t         constr,
+                         t_nrnb              *nrnb,
+                         gmx_wallcycle_t      wcycle,
+                         gmx_bool             bVerbose)
+{
+    gmx_domdec_t      *dd;
     gmx_domdec_comm_t *comm;
-    gmx_ddbox_t ddbox={0};
-    t_block *cgs_gl;
-    gmx_large_int_t step_pcoupl;
-    rvec cell_ns_x0,cell_ns_x1;
-    int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
-    gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
-    gmx_bool bRedist,bSortCG,bResortAll;
-    ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
-    real grid_density;
-    char sbuf[22];
-       
-    dd = cr->dd;
+    gmx_ddbox_t        ddbox = {0};
+    t_block           *cgs_gl;
+    gmx_int64_t        step_pcoupl;
+    rvec               cell_ns_x0, cell_ns_x1;
+    int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
+    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
+    gmx_bool           bRedist, bSortCG, bResortAll;
+    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
+    real               grid_density;
+    char               sbuf[22];
+
+    dd   = cr->dd;
     comm = dd->comm;
 
     bBoxChanged = (bMasterState || DEFORM(*ir));
@@ -9055,7 +9357,7 @@ void dd_partition_system(FILE            *fplog,
              * and every 100 partitionings,
              * so the extra communication cost is negligible.
              */
-            n = max(100,nstglobalcomm);
+            n         = max(100, nstglobalcomm);
             bCheckDLB = (comm->n_load_collect == 0 ||
                          comm->n_load_have % n == n-1);
         }
@@ -9063,7 +9365,7 @@ void dd_partition_system(FILE            *fplog,
         {
             bCheckDLB = FALSE;
         }
-        
+
         /* Print load every nstlog, first and last step to the log file */
         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
                     comm->n_load_collect == 0 ||
@@ -9076,12 +9378,12 @@ void dd_partition_system(FILE            *fplog,
         if (bDoDLB || bLogLoad || bCheckDLB ||
             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
         {
-            get_load_distribution(dd,wcycle);
+            get_load_distribution(dd, wcycle);
             if (DDMASTER(dd))
             {
                 if (bLogLoad)
                 {
-                    dd_print_load(fplog,dd,step-1);
+                    dd_print_load(fplog, dd, step-1);
                 }
                 if (bVerbose)
                 {
@@ -9090,23 +9392,24 @@ void dd_partition_system(FILE            *fplog,
             }
             comm->n_load_collect++;
 
-            if (bCheckDLB) {
+            if (bCheckDLB)
+            {
                 /* Since the timings are node dependent, the master decides */
                 if (DDMASTER(dd))
                 {
                     bTurnOnDLB =
-                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
+                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
                     if (debug)
                     {
-                        fprintf(debug,"step %s, imb loss %f\n",
-                                gmx_step_str(step,sbuf),
+                        fprintf(debug, "step %s, imb loss %f\n",
+                                gmx_step_str(step, sbuf),
                                 dd_force_imb_perf_loss(dd));
                     }
                 }
-                dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
+                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
                 if (bTurnOnDLB)
                 {
-                    turn_on_dlb(fplog,cr,step);
+                    turn_on_dlb(fplog, cr, step);
                     bDoDLB = TRUE;
                 }
             }
@@ -9120,66 +9423,66 @@ void dd_partition_system(FILE            *fplog,
     if (bMasterState)
     {
         /* Clear the old state */
-        clear_dd_indices(dd,0,0);
-
-        set_ddbox(dd,bMasterState,cr,ir,state_global->box,
-                  TRUE,cgs_gl,state_global->x,&ddbox);
-    
-        get_cg_distribution(fplog,step,dd,cgs_gl,
-                            state_global->box,&ddbox,state_global->x);
-        
-        dd_distribute_state(dd,cgs_gl,
-                            state_global,state_local,f);
-        
-        dd_make_local_cgs(dd,&top_local->cgs);
-        
+        clear_dd_indices(dd, 0, 0);
+        ncgindex_set = 0;
+
+        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
+                  TRUE, cgs_gl, state_global->x, &ddbox);
+
+        get_cg_distribution(fplog, step, dd, cgs_gl,
+                            state_global->box, &ddbox, state_global->x);
+
+        dd_distribute_state(dd, cgs_gl,
+                            state_global, state_local, f);
+
+        dd_make_local_cgs(dd, &top_local->cgs);
+
         /* Ensure that we have space for the new distribution */
-        dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
+        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
 
         if (fr->cutoff_scheme == ecutsGROUP)
         {
-            calc_cgcm(fplog,0,dd->ncg_home,
-                      &top_local->cgs,state_local->x,fr->cg_cm);
+            calc_cgcm(fplog, 0, dd->ncg_home,
+                      &top_local->cgs, state_local->x, fr->cg_cm);
         }
-        
-        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
-        
-        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
 
-        cg0 = 0;
+        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
+
+        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
     }
     else if (state_local->ddp_count != dd->ddp_count)
     {
         if (state_local->ddp_count > dd->ddp_count)
         {
-            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
+            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
         }
-        
+
         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
         {
-            gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
+            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
         }
-        
+
         /* Clear the old state */
-        clear_dd_indices(dd,0,0);
-        
+        clear_dd_indices(dd, 0, 0);
+
         /* Build the new indices */
-        rebuild_cgindex(dd,cgs_gl->index,state_local);
-        make_dd_indices(dd,cgs_gl->index,0);
+        rebuild_cgindex(dd, cgs_gl->index, state_local);
+        make_dd_indices(dd, cgs_gl->index, 0);
+        ncgindex_set = dd->ncg_home;
 
         if (fr->cutoff_scheme == ecutsGROUP)
         {
             /* Redetermine the cg COMs */
-            calc_cgcm(fplog,0,dd->ncg_home,
-                      &top_local->cgs,state_local->x,fr->cg_cm);
+            calc_cgcm(fplog, 0, dd->ncg_home,
+                      &top_local->cgs, state_local->x, fr->cg_cm);
         }
-        
-        inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
 
-        dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
+        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
 
-        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
-                  TRUE,&top_local->cgs,state_local->x,&ddbox);
+        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
+
+        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
+                  TRUE, &top_local->cgs, state_local->x, &ddbox);
 
         bRedist = comm->bDynLoadBal;
     }
@@ -9188,32 +9491,33 @@ void dd_partition_system(FILE            *fplog,
         /* We have the full state, only redistribute the cgs */
 
         /* Clear the non-home indices */
-        clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
+        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
+        ncgindex_set = 0;
 
         /* Avoid global communication for dim's without pbc and -gcom */
         if (!bNStGlobalComm)
         {
-            copy_rvec(comm->box0    ,ddbox.box0    );
-            copy_rvec(comm->box_size,ddbox.box_size);
+            copy_rvec(comm->box0, ddbox.box0    );
+            copy_rvec(comm->box_size, ddbox.box_size);
         }
-        set_ddbox(dd,bMasterState,cr,ir,state_local->box,
-                  bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
+        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
+                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
 
         bBoxChanged = TRUE;
-        bRedist = TRUE;
+        bRedist     = TRUE;
     }
     /* For dim's without pbc and -gcom */
-    copy_rvec(ddbox.box0    ,comm->box0    );
-    copy_rvec(ddbox.box_size,comm->box_size);
-    
-    set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
-                      step,wcycle);
-    
+    copy_rvec(ddbox.box0, comm->box0    );
+    copy_rvec(ddbox.box_size, comm->box_size);
+
+    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
+                      step, wcycle);
+
     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
     {
-        write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
+        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
     }
-    
+
     /* Check if we should sort the charge groups */
     if (comm->nstSortCG > 0)
     {
@@ -9230,53 +9534,53 @@ void dd_partition_system(FILE            *fplog,
     ncg_moved = 0;
     if (bRedist)
     {
-        wallcycle_sub_start(wcycle,ewcsDD_REDIST);
+        wallcycle_sub_start(wcycle, ewcsDD_REDIST);
 
-        dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
-                           state_local,f,fr,mdatoms,
-                           !bSortCG,nrnb,&cg0,&ncg_moved);
+        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
+                           state_local, f, fr,
+                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
 
-        wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
+        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
     }
-    
-    get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
-                          dd,&ddbox,
-                          &comm->cell_x0,&comm->cell_x1,
-                          dd->ncg_home,fr->cg_cm,
-                          cell_ns_x0,cell_ns_x1,&grid_density);
+
+    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
+                          dd, &ddbox,
+                          &comm->cell_x0, &comm->cell_x1,
+                          dd->ncg_home, fr->cg_cm,
+                          cell_ns_x0, cell_ns_x1, &grid_density);
 
     if (bBoxChanged)
     {
-        comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
+        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
     }
 
     switch (fr->cutoff_scheme)
     {
-    case ecutsGROUP:
-        copy_ivec(fr->ns.grid->n,ncells_old);
-        grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
-                   state_local->box,cell_ns_x0,cell_ns_x1,
-                   fr->rlistlong,grid_density);
-        break;
-    case ecutsVERLET:
-        nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
-        break;
-    default:
-        gmx_incons("unimplemented");
+        case ecutsGROUP:
+            copy_ivec(fr->ns.grid->n, ncells_old);
+            grid_first(fplog, fr->ns.grid, dd, &ddbox,
+                       state_local->box, cell_ns_x0, cell_ns_x1,
+                       fr->rlistlong, grid_density);
+            break;
+        case ecutsVERLET:
+            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
+            break;
+        default:
+            gmx_incons("unimplemented");
     }
     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
-    copy_ivec(ddbox.tric_dir,comm->tric_dir);
+    copy_ivec(ddbox.tric_dir, comm->tric_dir);
 
     if (bSortCG)
     {
-        wallcycle_sub_start(wcycle,ewcsDD_GRID);
+        wallcycle_sub_start(wcycle, ewcsDD_GRID);
 
         /* Sort the state on charge group position.
          * This enables exact restarts from this step.
          * It also improves performance by about 15% with larger numbers
          * of atoms per node.
          */
-        
+
         /* Fill the ns grid with the home cell,
          * so we can sort with the indices.
          */
@@ -9284,35 +9588,35 @@ void dd_partition_system(FILE            *fplog,
 
         switch (fr->cutoff_scheme)
         {
-        case ecutsVERLET:
-            set_zones_size(dd,state_local->box,&ddbox,0,1);
-
-            nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
-                              0,
-                              comm->zones.size[0].bb_x0,
-                              comm->zones.size[0].bb_x1,
-                              0,dd->ncg_home,
-                              comm->zones.dens_zone0,
-                              fr->cginfo,
-                              state_local->x,
-                              ncg_moved,comm->moved,
-                              fr->nbv->grp[eintLocal].kernel_type,
-                              fr->nbv->grp[eintLocal].nbat);
-
-            nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
-            break;
-        case ecutsGROUP:
-            fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
-                      0,dd->ncg_home,fr->cg_cm);
-            
-            copy_ivec(fr->ns.grid->n,ncells_new);
-            break;
-        default:
-            gmx_incons("unimplemented");
+            case ecutsVERLET:
+                set_zones_size(dd, state_local->box, &ddbox, 0, 1);
+
+                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
+                                  0,
+                                  comm->zones.size[0].bb_x0,
+                                  comm->zones.size[0].bb_x1,
+                                  0, dd->ncg_home,
+                                  comm->zones.dens_zone0,
+                                  fr->cginfo,
+                                  state_local->x,
+                                  ncg_moved, bRedist ? comm->moved : NULL,
+                                  fr->nbv->grp[eintLocal].kernel_type,
+                                  fr->nbv->grp[eintLocal].nbat);
+
+                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
+                break;
+            case ecutsGROUP:
+                fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
+                          0, dd->ncg_home, fr->cg_cm);
+
+                copy_ivec(fr->ns.grid->n, ncells_new);
+                break;
+            default:
+                gmx_incons("unimplemented");
         }
 
         bResortAll = bMasterState;
-   
+
         /* Check if we can user the old order and ns grid cell indices
          * of the charge groups to sort the charge groups efficiently.
          */
@@ -9325,89 +9629,89 @@ void dd_partition_system(FILE            *fplog,
 
         if (debug)
         {
-            fprintf(debug,"Step %s, sorting the %d home charge groups\n",
-                    gmx_step_str(step,sbuf),dd->ncg_home);
+            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
+                    gmx_step_str(step, sbuf), dd->ncg_home);
         }
-        dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
+        dd_sort_state(dd, fr->cg_cm, fr, state_local,
                       bResortAll ? -1 : ncg_home_old);
         /* Rebuild all the indices */
-        cg0 = 0;
         ga2la_clear(dd->ga2la);
+        ncgindex_set = 0;
 
-        wallcycle_sub_stop(wcycle,ewcsDD_GRID);
+        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
     }
 
-    wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
-    
+    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
+
     /* Setup up the communication and communicate the coordinates */
-    setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
-    
+    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
+
     /* Set the indices */
-    make_dd_indices(dd,cgs_gl->index,cg0);
+    make_dd_indices(dd, cgs_gl->index, ncgindex_set);
 
     /* Set the charge group boundaries for neighbor searching */
     set_cg_boundaries(&comm->zones);
 
     if (fr->cutoff_scheme == ecutsVERLET)
     {
-        set_zones_size(dd,state_local->box,&ddbox,
-                       bSortCG ? 1 : 0,comm->zones.n);
+        set_zones_size(dd, state_local->box, &ddbox,
+                       bSortCG ? 1 : 0, comm->zones.n);
     }
 
-    wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
+    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
 
     /*
-    write_dd_pdb("dd_home",step,"dump",top_global,cr,
+       write_dd_pdb("dd_home",step,"dump",top_global,cr,
                  -1,state_local->x,state_local->box);
-    */
+     */
+
+    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
 
-    wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
-    
     /* Extract a local topology from the global topology */
-    for(i=0; i<dd->ndim; i++)
+    for (i = 0; i < dd->ndim; i++)
     {
         np[dd->dim[i]] = comm->cd[i].np;
     }
-    dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
-                      comm->cellsize_min,np,
+    dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
+                      comm->cellsize_min, np,
                       fr,
-                      fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
-                      vsite,top_global,top_local);
+                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
+                      vsite, top_global, top_local);
+
+    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
 
-    wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
+    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
 
-    wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
-    
     /* Set up the special atom communication */
     n = comm->nat[ddnatZONE];
-    for(i=ddnatZONE+1; i<ddnatNR; i++)
+    for (i = ddnatZONE+1; i < ddnatNR; i++)
     {
-        switch(i)
+        switch (i)
         {
-        case ddnatVSITE:
-            if (vsite && vsite->n_intercg_vsite)
-            {
-                n = dd_make_local_vsites(dd,n,top_local->idef.il);
-            }
-            break;
-        case ddnatCON:
-            if (dd->bInterCGcons || dd->bInterCGsettles)
-            {
-                /* Only for inter-cg constraints we need special code */
-                n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
-                                              constr,ir->nProjOrder,
-                                              top_local->idef.il);
-            }
-            break;
-        default:
-            gmx_incons("Unknown special atom type setup");
+            case ddnatVSITE:
+                if (vsite && vsite->n_intercg_vsite)
+                {
+                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
+                }
+                break;
+            case ddnatCON:
+                if (dd->bInterCGcons || dd->bInterCGsettles)
+                {
+                    /* Only for inter-cg constraints we need special code */
+                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
+                                                  constr, ir->nProjOrder,
+                                                  top_local->idef.il);
+                }
+                break;
+            default:
+                gmx_incons("Unknown special atom type setup");
         }
         comm->nat[i] = n;
     }
 
-    wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
+    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
 
-    wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
+    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
 
     /* Make space for the extra coordinates for virtual site
      * or constraint communication.
@@ -9415,7 +9719,7 @@ void dd_partition_system(FILE            *fplog,
     state_local->natoms = comm->nat[ddnatNR-1];
     if (state_local->natoms > state_local->nalloc)
     {
-        dd_realloc_state(state_local,f,state_local->natoms);
+        dd_realloc_state(state_local, f, state_local->natoms);
     }
 
     if (fr->bF_NoVirSum)
@@ -9447,82 +9751,101 @@ void dd_partition_system(FILE            *fplog,
      * avoid some allocation, zeroing and copying, but this is
      * probably not worth the complications ande checking.
      */
-    forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
-                        dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
+    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
+                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
 
     /* We make the all mdatoms up to nat_tot_con.
      * We could save some work by only setting invmass
      * between nat_tot and nat_tot_con.
      */
     /* This call also sets the new number of home particles to dd->nat_home */
-    atoms2md(top_global,ir,
-             comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
+    atoms2md(top_global, ir,
+             comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
 
     /* Now we have the charges we can sort the FE interactions */
-    dd_sort_local_top(dd,mdatoms,top_local);
+    dd_sort_local_top(dd, mdatoms, top_local);
+
+    if (vsite != NULL)
+    {
+        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
+        split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
+                                  mdatoms, FALSE, vsite);
+    }
 
     if (shellfc)
     {
         /* Make the local shell stuff, currently no communication is done */
-        make_local_shells(cr,mdatoms,shellfc);
+        make_local_shells(cr, mdatoms, shellfc);
     }
-    
-       if (ir->implicit_solvent)
+
+    if (ir->implicit_solvent)
     {
-        make_local_gb(cr,fr->born,ir->gb_algorithm);
+        make_local_gb(cr, fr->born, ir->gb_algorithm);
     }
 
-    init_bonded_thread_force_reduction(fr,&top_local->idef);
+    setup_bonded_threading(fr, &top_local->idef);
 
     if (!(cr->duty & DUTY_PME))
     {
-        /* Send the charges to our PME only node */
-        gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
-                       mdatoms->chargeA,mdatoms->chargeB,
-                       dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
+        /* Send the charges and/or c6/sigmas to our PME only node */
+        gmx_pme_send_parameters(cr,
+                                fr->ic,
+                                mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
+                                mdatoms->chargeA, mdatoms->chargeB,
+                                mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
+                                mdatoms->sigmaA, mdatoms->sigmaB,
+                                dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
     }
-    
+
     if (constr)
     {
-        set_constraints(constr,top_local,ir,mdatoms,cr);
+        set_constraints(constr, top_local, ir, mdatoms, cr);
     }
-    
+
     if (ir->ePull != epullNO)
     {
         /* Update the local pull groups */
-        dd_make_local_pull_groups(dd,ir->pull,mdatoms);
+        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
     }
-    
+
     if (ir->bRot)
     {
         /* Update the local rotation groups */
-        dd_make_local_rotation_groups(dd,ir->rot);
+        dd_make_local_rotation_groups(dd, ir->rot);
+    }
+
+    if (ir->eSwapCoords != eswapNO)
+    {
+        /* Update the local groups needed for ion swapping */
+        dd_make_local_swap_groups(dd, ir->swap);
     }
 
+    /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
+    dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
 
     add_dd_statistics(dd);
-    
+
     /* Make sure we only count the cycles for this DD partitioning */
     clear_dd_cycle_counts(dd);
-    
+
     /* Because the order of the atoms might have changed since
      * the last vsite construction, we need to communicate the constructing
      * atom coordinates again (for spreading the forces this MD step).
      */
-    dd_move_x_vsites(dd,state_local->box,state_local->x);
+    dd_move_x_vsites(dd, state_local->box, state_local->x);
+
+    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
 
-    wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
-    
     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
     {
-        dd_move_x(dd,state_local->box,state_local->x);
-        write_dd_pdb("dd_dump",step,"dump",top_global,cr,
-                     -1,state_local->x,state_local->box);
+        dd_move_x(dd, state_local->box, state_local->x);
+        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
+                     -1, state_local->x, state_local->box);
     }
 
     /* Store the partitioning step */
     comm->partition_step = step;
-    
+
     /* Increase the DD partitioning counter */
     dd->ddp_count++;
     /* The state currently matches this DD partitioning count, store it */
@@ -9538,7 +9861,7 @@ void dd_partition_system(FILE            *fplog,
     if (comm->DD_debug > 0)
     {
         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
-        check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
+        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                 "after partitioning");
     }
 }