Code beautification with uncrustify
[alexxy/gromacs.git] / src/gromacs/mdlib/pme.c
index 2625e16d3a24003d40b75401a3bef8fdbc3bc49c..51cc32c7a545ebc13cb9b9da80affae73a4e68d6 100644
--- a/src/gromacs/mdlib/pme.c
+++ b/src/gromacs/mdlib/pme.c
@@ -137,102 +137,102 @@ typedef struct {
 
 typedef struct {
 #ifdef GMX_MPI
-    MPI_Comm mpi_comm;
+    MPI_Comm         mpi_comm;
 #endif
-    int  nnodes,nodeid;
-    int  *s2g0;
-    int  *s2g1;
-    int  noverlap_nodes;
-    int  *send_id,*recv_id;
-    int  send_size;             /* Send buffer width, used with OpenMP */
+    int              nnodes, nodeid;
+    int             *s2g0;
+    int             *s2g1;
+    int              noverlap_nodes;
+    int             *send_id, *recv_id;
+    int              send_size; /* Send buffer width, used with OpenMP */
     pme_grid_comm_t *comm_data;
-    real *sendbuf;
-    real *recvbuf;
+    real            *sendbuf;
+    real            *recvbuf;
 } pme_overlap_t;
 
 typedef struct {
-    int *n;     /* Cumulative counts of the number of particles per thread */
-    int nalloc; /* Allocation size of i */
-    int *i;     /* Particle indices ordered on thread index (n) */
+    int *n;      /* Cumulative counts of the number of particles per thread */
+    int  nalloc; /* Allocation size of i */
+    int *i;      /* Particle indices ordered on thread index (n) */
 } thread_plist_t;
 
 typedef struct {
-    int  *thread_one;
-    int  n;
-    int  *ind;
+    int      *thread_one;
+    int       n;
+    int      *ind;
     splinevec theta;
-    real *ptr_theta_z;
+    real     *ptr_theta_z;
     splinevec dtheta;
-    real *ptr_dtheta_z;
+    real     *ptr_dtheta_z;
 } splinedata_t;
 
 typedef struct {
-    int  dimind;            /* The index of the dimension, 0=x, 1=y */
-    int  nslab;
-    int  nodeid;
+    int      dimind;        /* The index of the dimension, 0=x, 1=y */
+    int      nslab;
+    int      nodeid;
 #ifdef GMX_MPI
     MPI_Comm mpi_comm;
 #endif
 
-    int  *node_dest;        /* The nodes to send x and q to with DD */
-    int  *node_src;         /* The nodes to receive x and q from with DD */
-    int  *buf_index;        /* Index for commnode into the buffers */
+    int     *node_dest;     /* The nodes to send x and q to with DD */
+    int     *node_src;      /* The nodes to receive x and q from with DD */
+    int     *buf_index;     /* Index for commnode into the buffers */
 
-    int  maxshift;
+    int      maxshift;
 
-    int  npd;
-    int  pd_nalloc;
-    int  *pd;
-    int  *count;            /* The number of atoms to send to each node */
-    int  **count_thread;
-    int  *rcount;           /* The number of atoms to receive */
-
-    int  n;
-    int  nalloc;
-    rvec *x;
-    real *q;
-    rvec *f;
+    int      npd;
+    int      pd_nalloc;
+    int     *pd;
+    int     *count;         /* The number of atoms to send to each node */
+    int    **count_thread;
+    int     *rcount;        /* The number of atoms to receive */
+
+    int      n;
+    int      nalloc;
+    rvec    *x;
+    real    *q;
+    rvec    *f;
     gmx_bool bSpread;       /* These coordinates are used for spreading */
-    int  pme_order;
-    ivec *idx;
-    rvec *fractx;            /* Fractional coordinate relative to the
-                              * lower cell boundary
-                              */
-    int  nthread;
-    int  *thread_idx;        /* Which thread should spread which charge */
+    int      pme_order;
+    ivec    *idx;
+    rvec    *fractx;            /* Fractional coordinate relative to the
+                                 * lower cell boundary
+                                 */
+    int             nthread;
+    int            *thread_idx; /* Which thread should spread which charge */
     thread_plist_t *thread_plist;
-    splinedata_t *spline;
+    splinedata_t   *spline;
 } pme_atomcomm_t;
 
 #define FLBS  3
 #define FLBSZ 4
 
 typedef struct {
-    ivec ci;     /* The spatial location of this grid         */
-    ivec n;      /* The used size of *grid, including order-1 */
-    ivec offset; /* The grid offset from the full node grid   */
-    int  order;  /* PME spreading order                       */
-    ivec s;      /* The allocated size of *grid, s >= n       */
-    real *grid;  /* The grid local thread, size n             */
+    ivec  ci;     /* The spatial location of this grid         */
+    ivec  n;      /* The used size of *grid, including order-1 */
+    ivec  offset; /* The grid offset from the full node grid   */
+    int   order;  /* PME spreading order                       */
+    ivec  s;      /* The allocated size of *grid, s >= n       */
+    real *grid;   /* The grid local thread, size n             */
 } pmegrid_t;
 
 typedef struct {
-    pmegrid_t grid;     /* The full node grid (non thread-local)            */
-    int  nthread;       /* The number of threads operating on this grid     */
-    ivec nc;            /* The local spatial decomposition over the threads */
-    pmegrid_t *grid_th; /* Array of grids for each thread                   */
-    real *grid_all;     /* Allocated array for the grids in *grid_th        */
-    int  **g2t;         /* The grid to thread index                         */
-    ivec nthread_comm;  /* The number of threads to communicate with        */
+    pmegrid_t  grid;         /* The full node grid (non thread-local)            */
+    int        nthread;      /* The number of threads operating on this grid     */
+    ivec       nc;           /* The local spatial decomposition over the threads */
+    pmegrid_t *grid_th;      /* Array of grids for each thread                   */
+    real      *grid_all;     /* Allocated array for the grids in *grid_th        */
+    int      **g2t;          /* The grid to thread index                         */
+    ivec       nthread_comm; /* The number of threads to communicate with        */
 } pmegrids_t;
 
 
 typedef struct {
 #ifdef PME_SSE
     /* Masks for SSE aligned spreading and gathering */
-    __m128 mask_SSE0[6],mask_SSE1[6];
+    __m128 mask_SSE0[6], mask_SSE1[6];
 #else
-    int dummy; /* C89 requires that struct has at least one member */
+    int    dummy; /* C89 requires that struct has at least one member */
 #endif
 } pme_spline_work_t;
 
@@ -254,68 +254,68 @@ typedef struct {
 } pme_work_t;
 
 typedef struct gmx_pme {
-    int  ndecompdim;         /* The number of decomposition dimensions */
-    int  nodeid;             /* Our nodeid in mpi->mpi_comm */
-    int  nodeid_major;
-    int  nodeid_minor;
-    int  nnodes;             /* The number of nodes doing PME */
-    int  nnodes_major;
-    int  nnodes_minor;
-
-    MPI_Comm mpi_comm;
-    MPI_Comm mpi_comm_d[2];  /* Indexed on dimension, 0=x, 1=y */
+    int           ndecompdim; /* The number of decomposition dimensions */
+    int           nodeid;     /* Our nodeid in mpi->mpi_comm */
+    int           nodeid_major;
+    int           nodeid_minor;
+    int           nnodes;    /* The number of nodes doing PME */
+    int           nnodes_major;
+    int           nnodes_minor;
+
+    MPI_Comm      mpi_comm;
+    MPI_Comm      mpi_comm_d[2]; /* Indexed on dimension, 0=x, 1=y */
 #ifdef GMX_MPI
-    MPI_Datatype  rvec_mpi;  /* the pme vector's MPI type */
+    MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
 #endif
 
-    int  nthread;            /* The number of threads doing PME */
+    int        nthread;       /* The number of threads doing PME */
 
-    gmx_bool bPPnode;        /* Node also does particle-particle forces */
-    gmx_bool bFEP;           /* Compute Free energy contribution */
-    int nkx,nky,nkz;         /* Grid dimensions */
-    gmx_bool bP3M;           /* Do P3M: optimize the influence function */
-    int pme_order;
-    real epsilon_r;
+    gmx_bool   bPPnode;       /* Node also does particle-particle forces */
+    gmx_bool   bFEP;          /* Compute Free energy contribution */
+    int        nkx, nky, nkz; /* Grid dimensions */
+    gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
+    int        pme_order;
+    real       epsilon_r;
 
     pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
     pmegrids_t pmegridB;
     /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
-    int     pmegrid_nx,pmegrid_ny,pmegrid_nz;
+    int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
     /* pmegrid_nz might be larger than strictly necessary to ensure
      * memory alignment, pmegrid_nz_base gives the real base size.
      */
     int     pmegrid_nz_base;
     /* The local PME grid starting indices */
-    int     pmegrid_start_ix,pmegrid_start_iy,pmegrid_start_iz;
+    int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
 
     /* Work data for spreading and gathering */
-    pme_spline_work_t *spline_work;
+    pme_spline_work_t    *spline_work;
 
-    real *fftgridA;             /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
-    real *fftgridB;             /* inside the interpolation grid, but separate for 2D PME decomp. */
-    int   fftgrid_nx,fftgrid_ny,fftgrid_nz;
+    real                 *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
+    real                 *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
+    int                   fftgrid_nx, fftgrid_ny, fftgrid_nz;
 
-    t_complex *cfftgridA;             /* Grids for complex FFT data */
-    t_complex *cfftgridB;
-    int   cfftgrid_nx,cfftgrid_ny,cfftgrid_nz;
+    t_complex            *cfftgridA;  /* Grids for complex FFT data */
+    t_complex            *cfftgridB;
+    int                   cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
 
     gmx_parallel_3dfft_t  pfft_setupA;
     gmx_parallel_3dfft_t  pfft_setupB;
 
-    int  *nnx,*nny,*nnz;
-    real *fshx,*fshy,*fshz;
+    int                  *nnx, *nny, *nnz;
+    real                 *fshx, *fshy, *fshz;
 
-    pme_atomcomm_t atc[2];  /* Indexed on decomposition index */
-    matrix    recipbox;
-    splinevec bsp_mod;
+    pme_atomcomm_t        atc[2]; /* Indexed on decomposition index */
+    matrix                recipbox;
+    splinevec             bsp_mod;
 
-    pme_overlap_t overlap[2]; /* Indexed on dimension, 0=x, 1=y */
+    pme_overlap_t         overlap[2]; /* Indexed on dimension, 0=x, 1=y */
 
-    pme_atomcomm_t atc_energy; /* Only for gmx_pme_calc_energy */
+    pme_atomcomm_t        atc_energy; /* Only for gmx_pme_calc_energy */
 
-    rvec *bufv;             /* Communication buffer */
-    real *bufr;             /* Communication buffer */
-    int  buf_nalloc;        /* The communication buffer size */
+    rvec                 *bufv;       /* Communication buffer */
+    real                 *bufr;       /* Communication buffer */
+    int                   buf_nalloc; /* The communication buffer size */
 
     /* thread local work data for solve_pme */
     pme_work_t *work;
@@ -337,21 +337,21 @@ typedef struct gmx_pme {
 } t_gmx_pme;
 
 
-static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
-                                   int start,int end,int thread)
+static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
+                                   int start, int end, int thread)
 {
-    int  i;
-    int  *idxptr,tix,tiy,tiz;
-    real *xptr,*fptr,tx,ty,tz;
-    real rxx,ryx,ryy,rzx,rzy,rzz;
-    int  nx,ny,nz;
-    int  start_ix,start_iy,start_iz;
-    int  *g2tx,*g2ty,*g2tz;
-    gmx_bool bThreads;
-    int  *thread_idx=NULL;
-    thread_plist_t *tpl=NULL;
-    int  *tpl_n=NULL;
-    int  thread_i;
+    int             i;
+    int            *idxptr, tix, tiy, tiz;
+    real           *xptr, *fptr, tx, ty, tz;
+    real            rxx, ryx, ryy, rzx, rzy, rzz;
+    int             nx, ny, nz;
+    int             start_ix, start_iy, start_iz;
+    int            *g2tx, *g2ty, *g2tz;
+    gmx_bool        bThreads;
+    int            *thread_idx = NULL;
+    thread_plist_t *tpl        = NULL;
+    int            *tpl_n      = NULL;
+    int             thread_i;
 
     nx  = pme->nkx;
     ny  = pme->nky;
@@ -379,13 +379,14 @@ static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
 
         tpl   = &atc->thread_plist[thread];
         tpl_n = tpl->n;
-        for(i=0; i<atc->nthread; i++)
+        for (i = 0; i < atc->nthread; i++)
         {
             tpl_n[i] = 0;
         }
     }
 
-    for(i=start; i<end; i++) {
+    for (i = start; i < end; i++)
+    {
         xptr   = atc->x[i];
         idxptr = atc->idx[i];
         fptr   = atc->fractx[i];
@@ -411,14 +412,14 @@ static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
         idxptr[ZZ] = pme->nnz[tiz];
 
 #ifdef DEBUG
-        range_check(idxptr[XX],0,pme->pmegrid_nx);
-        range_check(idxptr[YY],0,pme->pmegrid_ny);
-        range_check(idxptr[ZZ],0,pme->pmegrid_nz);
+        range_check(idxptr[XX], 0, pme->pmegrid_nx);
+        range_check(idxptr[YY], 0, pme->pmegrid_ny);
+        range_check(idxptr[ZZ], 0, pme->pmegrid_nz);
 #endif
 
         if (bThreads)
         {
-            thread_i = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
+            thread_i      = g2tx[idxptr[XX]] + g2ty[idxptr[YY]] + g2tz[idxptr[ZZ]];
             thread_idx[i] = thread_i;
             tpl_n[thread_i]++;
         }
@@ -429,7 +430,7 @@ static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
         /* Make a list of particle indices sorted on thread */
 
         /* Get the cumulative count */
-        for(i=1; i<atc->nthread; i++)
+        for (i = 1; i < atc->nthread; i++)
         {
             tpl_n[i] += tpl_n[i-1];
         }
@@ -440,17 +441,17 @@ static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
         if (tpl_n[atc->nthread-1] > tpl->nalloc)
         {
             tpl->nalloc = over_alloc_large(tpl_n[atc->nthread-1]);
-            srenew(tpl->i,tpl->nalloc);
+            srenew(tpl->i, tpl->nalloc);
         }
         /* Set tpl_n to the cumulative start */
-        for(i=atc->nthread-1; i>=1; i--)
+        for (i = atc->nthread-1; i >= 1; i--)
         {
             tpl_n[i] = tpl_n[i-1];
         }
         tpl_n[0] = 0;
 
         /* Fill our thread local array with indices sorted on thread */
-        for(i=start; i<end; i++)
+        for (i = start; i < end; i++)
         {
             tpl->i[tpl_n[atc->thread_idx[i]]++] = i;
         }
@@ -459,16 +460,16 @@ static void calc_interpolation_idx(gmx_pme_t pme,pme_atomcomm_t *atc,
 }
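
Aside from the reindentation, the tail of calc_interpolation_idx above is a counting sort: per-thread counts in tpl_n, a prefix sum, a shift to start offsets, then a scatter of the particle indices into tpl->i. A minimal stand-alone sketch of that pattern (plain C with hypothetical names, not the GROMACS API):

#include <stdio.h>

#define NPART   8
#define NTHREAD 3

/* Bucket particle indices by owning thread, as in calc_interpolation_idx:
 * count, cumulate, shift to start offsets, then scatter. */
static void sort_by_thread(const int thread_idx[NPART], int sorted[NPART])
{
    int n[NTHREAD] = { 0 };
    int t, i;

    for (i = 0; i < NPART; i++)          /* per-thread counts         */
    {
        n[thread_idx[i]]++;
    }
    for (t = 1; t < NTHREAD; t++)        /* cumulative counts         */
    {
        n[t] += n[t-1];
    }
    for (t = NTHREAD-1; t >= 1; t--)     /* shift to start offsets    */
    {
        n[t] = n[t-1];
    }
    n[0] = 0;
    for (i = 0; i < NPART; i++)          /* scatter, sorted on thread */
    {
        sorted[n[thread_idx[i]]++] = i;
    }
}

int main(void)
{
    const int thread_idx[NPART] = { 2, 0, 1, 0, 2, 1, 0, 1 };
    int       sorted[NPART];
    int       i;

    sort_by_thread(thread_idx, sorted);
    for (i = 0; i < NPART; i++)
    {
        printf("%d ", sorted[i]);        /* prints 1 3 6 2 5 7 0 4 */
    }
    printf("\n");
    return 0;
}
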
 
 static void make_thread_local_ind(pme_atomcomm_t *atc,
-                                  int thread,splinedata_t *spline)
+                                  int thread, splinedata_t *spline)
 {
-    int  n,t,i,start,end;
+    int             n, t, i, start, end;
     thread_plist_t *tpl;
 
     /* Combine the indices made by each thread into one index */
 
-    n = 0;
+    n     = 0;
     start = 0;
-    for(t=0; t<atc->nthread; t++)
+    for (t = 0; t < atc->nthread; t++)
     {
         tpl = &atc->thread_plist[t];
         /* Copy our part (start - end) from the list of thread t */
@@ -477,7 +478,7 @@ static void make_thread_local_ind(pme_atomcomm_t *atc,
             start = tpl->n[thread-1];
         }
         end = tpl->n[thread];
-        for(i=start; i<end; i++)
+        for (i = start; i < end; i++)
         {
             spline->ind[n++] = tpl->i[i];
         }
@@ -491,11 +492,11 @@ static void pme_calc_pidx(int start, int end,
                           matrix recipbox, rvec x[],
                           pme_atomcomm_t *atc, int *count)
 {
-    int  nslab,i;
-    int  si;
-    real *xptr,s;
-    real rxx,ryx,rzx,ryy,rzy;
-    int *pd;
+    int   nslab, i;
+    int   si;
+    real *xptr, s;
+    real  rxx, ryx, rzx, ryy, rzy;
+    int  *pd;
 
     /* Calculate PME task index (pidx) for each grid index.
      * Here we always assign equally sized slabs to each node
@@ -506,7 +507,7 @@ static void pme_calc_pidx(int start, int end,
     pd    = atc->pd;
 
     /* Reset the count */
-    for(i=0; i<nslab; i++)
+    for (i = 0; i < nslab; i++)
     {
         count[i] = 0;
     }
@@ -517,12 +518,12 @@ static void pme_calc_pidx(int start, int end,
         ryx = recipbox[YY][XX];
         rzx = recipbox[ZZ][XX];
         /* Calculate the node index in x-dimension */
-        for(i=start; i<end; i++)
+        for (i = start; i < end; i++)
         {
             xptr   = x[i];
             /* Fractional coordinates along box vectors */
-            s = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
-            si = (int)(s + 2*nslab) % nslab;
+            s     = nslab*(xptr[XX]*rxx + xptr[YY]*ryx + xptr[ZZ]*rzx);
+            si    = (int)(s + 2*nslab) % nslab;
             pd[i] = si;
             count[si]++;
         }
@@ -532,12 +533,12 @@ static void pme_calc_pidx(int start, int end,
         ryy = recipbox[YY][YY];
         rzy = recipbox[ZZ][YY];
         /* Calculate the node index in y-dimension */
-        for(i=start; i<end; i++)
+        for (i = start; i < end; i++)
         {
             xptr   = x[i];
             /* Fractional coordinates along box vectors */
-            s = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
-            si = (int)(s + 2*nslab) % nslab;
+            s     = nslab*(xptr[YY]*ryy + xptr[ZZ]*rzy);
+            si    = (int)(s + 2*nslab) % nslab;
             pd[i] = si;
             count[si]++;
         }
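
The only substantive expression in pme_calc_pidx is the slab assignment: s is the coordinate along the decomposed box vector scaled by nslab, and (int)(s + 2*nslab) % nslab wraps slightly negative or out-of-box values onto a valid slab in [0, nslab). A small illustration of just that idiom (toy values, hypothetical names):

#include <stdio.h>

/* Map a fractional coordinate (in box units, possibly a bit outside
 * [0,1) because of periodic images) onto one of nslab equal slabs.
 * Adding 2*nslab before the cast keeps the argument positive, so the
 * truncation and the modulo act as a periodic wrap. */
static int slab_index(double frac, int nslab)
{
    double s = nslab*frac;

    return (int)(s + 2*nslab) % nslab;
}

int main(void)
{
    printf("%d\n", slab_index( 0.10, 4));   /* 0                                  */
    printf("%d\n", slab_index( 0.80, 4));   /* 3                                  */
    printf("%d\n", slab_index(-0.05, 4));   /* 3: wrapped from just below the box */
    printf("%d\n", slab_index( 1.02, 4));   /* 0: wrapped from just above the box */
    return 0;
}
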
@@ -547,40 +548,40 @@ static void pme_calc_pidx(int start, int end,
 static void pme_calc_pidx_wrapper(int natoms, matrix recipbox, rvec x[],
                                   pme_atomcomm_t *atc)
 {
-    int nthread,thread,slab;
+    int nthread, thread, slab;
 
     nthread = atc->nthread;
 
 #pragma omp parallel for num_threads(nthread) schedule(static)
-    for(thread=0; thread<nthread; thread++)
+    for (thread = 0; thread < nthread; thread++)
     {
         pme_calc_pidx(natoms* thread   /nthread,
                       natoms*(thread+1)/nthread,
-                      recipbox,x,atc,atc->count_thread[thread]);
+                      recipbox, x, atc, atc->count_thread[thread]);
     }
     /* Non-parallel reduction, since nslab is small */
 
-    for(thread=1; thread<nthread; thread++)
+    for (thread = 1; thread < nthread; thread++)
     {
-        for(slab=0; slab<atc->nslab; slab++)
+        for (slab = 0; slab < atc->nslab; slab++)
         {
             atc->count_thread[0][slab] += atc->count_thread[thread][slab];
         }
     }
 }
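
pme_calc_pidx_wrapper has each OpenMP thread count into its own row of count_thread and then reduces the rows serially, which avoids atomics on the shared counters because nslab is small. The same pattern in a self-contained sketch (made-up sizes and names; build with OpenMP enabled, e.g. -fopenmp):

#include <stdio.h>

#define NDATA   12
#define NBINS   3
#define NTHREAD 4

int main(void)
{
    /* Which bin each item belongs to (arbitrary example data). */
    const int bin_of[NDATA] = { 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2 };
    int       count[NTHREAD][NBINS] = { { 0 } };
    int       thread, b;

    /* Each thread counts its own contiguous chunk into its own row. */
#pragma omp parallel for num_threads(NTHREAD) schedule(static)
    for (thread = 0; thread < NTHREAD; thread++)
    {
        int start = NDATA* thread   /NTHREAD;
        int end   = NDATA*(thread+1)/NTHREAD;
        int i;

        for (i = start; i < end; i++)
        {
            count[thread][bin_of[i]]++;
        }
    }

    /* Serial reduction into row 0; cheap because the number of bins is small. */
    for (thread = 1; thread < NTHREAD; thread++)
    {
        for (b = 0; b < NBINS; b++)
        {
            count[0][b] += count[thread][b];
        }
    }

    for (b = 0; b < NBINS; b++)
    {
        printf("bin %d: %d\n", b, count[0][b]);   /* 4 each */
    }
    return 0;
}
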
 
-static void realloc_splinevec(splinevec th,real **ptr_z,int nalloc)
+static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
 {
-    const int padding=4;
-    int i;
+    const int padding = 4;
+    int       i;
 
-    srenew(th[XX],nalloc);
-    srenew(th[YY],nalloc);
+    srenew(th[XX], nalloc);
+    srenew(th[YY], nalloc);
     /* In z we add padding, this is only required for the aligned SSE code */
-    srenew(*ptr_z,nalloc+2*padding);
+    srenew(*ptr_z, nalloc+2*padding);
     th[ZZ] = *ptr_z + padding;
 
-    for(i=0; i<padding; i++)
+    for (i = 0; i < padding; i++)
     {
         (*ptr_z)[               i] = 0;
         (*ptr_z)[padding+nalloc+i] = 0;
@@ -589,54 +590,56 @@ static void realloc_splinevec(splinevec th,real **ptr_z,int nalloc)
 
 static void pme_realloc_splinedata(splinedata_t *spline, pme_atomcomm_t *atc)
 {
-    int i,d;
+    int i, d;
 
-    srenew(spline->ind,atc->nalloc);
+    srenew(spline->ind, atc->nalloc);
     /* Initialize the index to identity so it works without threads */
-    for(i=0; i<atc->nalloc; i++)
+    for (i = 0; i < atc->nalloc; i++)
     {
         spline->ind[i] = i;
     }
 
-    realloc_splinevec(spline->theta,&spline->ptr_theta_z,
+    realloc_splinevec(spline->theta, &spline->ptr_theta_z,
                       atc->pme_order*atc->nalloc);
-    realloc_splinevec(spline->dtheta,&spline->ptr_dtheta_z,
+    realloc_splinevec(spline->dtheta, &spline->ptr_dtheta_z,
                       atc->pme_order*atc->nalloc);
 }
 
 static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
 {
-    int nalloc_old,i,j,nalloc_tpl;
+    int nalloc_old, i, j, nalloc_tpl;
 
     /* We have to avoid a NULL pointer for atc->x to avoid
      * possible fatal errors in MPI routines.
      */
     if (atc->n > atc->nalloc || atc->nalloc == 0)
     {
-        nalloc_old = atc->nalloc;
-        atc->nalloc = over_alloc_dd(max(atc->n,1));
+        nalloc_old  = atc->nalloc;
+        atc->nalloc = over_alloc_dd(max(atc->n, 1));
 
-        if (atc->nslab > 1) {
-            srenew(atc->x,atc->nalloc);
-            srenew(atc->q,atc->nalloc);
-            srenew(atc->f,atc->nalloc);
-            for(i=nalloc_old; i<atc->nalloc; i++)
+        if (atc->nslab > 1)
+        {
+            srenew(atc->x, atc->nalloc);
+            srenew(atc->q, atc->nalloc);
+            srenew(atc->f, atc->nalloc);
+            for (i = nalloc_old; i < atc->nalloc; i++)
             {
                 clear_rvec(atc->f[i]);
             }
         }
-        if (atc->bSpread) {
-            srenew(atc->fractx,atc->nalloc);
-            srenew(atc->idx   ,atc->nalloc);
+        if (atc->bSpread)
+        {
+            srenew(atc->fractx, atc->nalloc);
+            srenew(atc->idx, atc->nalloc);
 
             if (atc->nthread > 1)
             {
-                srenew(atc->thread_idx,atc->nalloc);
+                srenew(atc->thread_idx, atc->nalloc);
             }
 
-            for(i=0; i<atc->nthread; i++)
+            for (i = 0; i < atc->nthread; i++)
             {
-                pme_realloc_splinedata(&atc->spline[i],atc);
+                pme_realloc_splinedata(&atc->spline[i], atc);
             }
         }
     }
@@ -649,41 +652,51 @@ static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
 /* domain decomposition by x coordinate           */
 {
     int *idxa;
-    int i, ii;
-
-    if(FALSE == pme->redist_init) {
-        snew(pme->scounts,atc->nslab);
-        snew(pme->rcounts,atc->nslab);
-        snew(pme->sdispls,atc->nslab);
-        snew(pme->rdispls,atc->nslab);
-        snew(pme->sidx,atc->nslab);
+    int  i, ii;
+
+    if (FALSE == pme->redist_init)
+    {
+        snew(pme->scounts, atc->nslab);
+        snew(pme->rcounts, atc->nslab);
+        snew(pme->sdispls, atc->nslab);
+        snew(pme->rdispls, atc->nslab);
+        snew(pme->sidx, atc->nslab);
         pme->redist_init = TRUE;
     }
-    if (n > pme->redist_buf_nalloc) {
+    if (n > pme->redist_buf_nalloc)
+    {
         pme->redist_buf_nalloc = over_alloc_dd(n);
-        srenew(pme->redist_buf,pme->redist_buf_nalloc*DIM);
+        srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
     }
 
     pme->idxa = atc->pd;
 
 #ifdef GMX_MPI
-    if (forw && bXF) {
+    if (forw && bXF)
+    {
         /* forward, redistribution from pp to pme */
 
         /* Calculate send counts and exchange them with other nodes */
-        for(i=0; (i<atc->nslab); i++) pme->scounts[i]=0;
-        for(i=0; (i<n); i++) pme->scounts[pme->idxa[i]]++;
+        for (i = 0; (i < atc->nslab); i++)
+        {
+            pme->scounts[i] = 0;
+        }
+        for (i = 0; (i < n); i++)
+        {
+            pme->scounts[pme->idxa[i]]++;
+        }
         MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
 
         /* Calculate send and receive displacements and index into send
            buffer */
-        pme->sdispls[0]=0;
-        pme->rdispls[0]=0;
-        pme->sidx[0]=0;
-        for(i=1; i<atc->nslab; i++) {
-            pme->sdispls[i]=pme->sdispls[i-1]+pme->scounts[i-1];
-            pme->rdispls[i]=pme->rdispls[i-1]+pme->rcounts[i-1];
-            pme->sidx[i]=pme->sdispls[i];
+        pme->sdispls[0] = 0;
+        pme->rdispls[0] = 0;
+        pme->sidx[0]    = 0;
+        for (i = 1; i < atc->nslab; i++)
+        {
+            pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
+            pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
+            pme->sidx[i]    = pme->sdispls[i];
         }
         /* Total # of particles to be received */
         atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
@@ -691,39 +704,49 @@ static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
         pme_realloc_atomcomm_things(atc);
 
         /* Copy particle coordinates into send buffer and exchange*/
-        for(i=0; (i<n); i++) {
-            ii=DIM*pme->sidx[pme->idxa[i]];
+        for (i = 0; (i < n); i++)
+        {
+            ii = DIM*pme->sidx[pme->idxa[i]];
             pme->sidx[pme->idxa[i]]++;
-            pme->redist_buf[ii+XX]=x_f[i][XX];
-            pme->redist_buf[ii+YY]=x_f[i][YY];
-            pme->redist_buf[ii+ZZ]=x_f[i][ZZ];
+            pme->redist_buf[ii+XX] = x_f[i][XX];
+            pme->redist_buf[ii+YY] = x_f[i][YY];
+            pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
         }
         MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
                       pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
                       pme->rvec_mpi, atc->mpi_comm);
     }
-    if (forw) {
+    if (forw)
+    {
         /* Copy charge into send buffer and exchange*/
-        for(i=0; i<atc->nslab; i++) pme->sidx[i]=pme->sdispls[i];
-        for(i=0; (i<n); i++) {
-            ii=pme->sidx[pme->idxa[i]];
+        for (i = 0; i < atc->nslab; i++)
+        {
+            pme->sidx[i] = pme->sdispls[i];
+        }
+        for (i = 0; (i < n); i++)
+        {
+            ii = pme->sidx[pme->idxa[i]];
             pme->sidx[pme->idxa[i]]++;
-            pme->redist_buf[ii]=charge[i];
+            pme->redist_buf[ii] = charge[i];
         }
         MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
                       atc->q, pme->rcounts, pme->rdispls, mpi_type,
                       atc->mpi_comm);
     }
-    else { /* backward, redistribution from pme to pp */
+    else   /* backward, redistribution from pme to pp */
+    {
         MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
                       pme->redist_buf, pme->scounts, pme->sdispls,
                       pme->rvec_mpi, atc->mpi_comm);
 
         /* Copy data from receive buffer */
-        for(i=0; i<atc->nslab; i++)
+        for (i = 0; i < atc->nslab; i++)
+        {
             pme->sidx[i] = pme->sdispls[i];
-        for(i=0; (i<n); i++) {
-            ii = DIM*pme->sidx[pme->idxa[i]];
+        }
+        for (i = 0; (i < n); i++)
+        {
+            ii          = DIM*pme->sidx[pme->idxa[i]];
             x_f[i][XX] += pme->redist_buf[ii+XX];
             x_f[i][YY] += pme->redist_buf[ii+YY];
             x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
@@ -734,36 +757,44 @@ static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
 }
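
pmeredist_pd follows the standard MPI_Alltoallv recipe: count what goes to each rank, exchange the counts with MPI_Alltoall, prefix-sum both sides into displacements, pack, and exchange. A stripped-down sketch of that recipe with hypothetical names (one double array instead of coordinates and charges, no error handling):

/* Minimal MPI_Alltoallv example: rank r sends r+1 doubles to every rank.
 * Build with an MPI wrapper (e.g. mpicc) and run under mpirun. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    int     rank, nrank, i, nsend, nrecv;
    int    *scounts, *rcounts, *sdispls, *rdispls;
    double *sendbuf, *recvbuf;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nrank);

    scounts = malloc(nrank*sizeof(int));
    rcounts = malloc(nrank*sizeof(int));
    sdispls = malloc(nrank*sizeof(int));
    rdispls = malloc(nrank*sizeof(int));

    /* Send counts, then let every rank learn what it will receive */
    for (i = 0; i < nrank; i++)
    {
        scounts[i] = rank + 1;
    }
    MPI_Alltoall(scounts, 1, MPI_INT, rcounts, 1, MPI_INT, MPI_COMM_WORLD);

    /* Displacements are the prefix sums of the counts */
    sdispls[0] = 0;
    rdispls[0] = 0;
    for (i = 1; i < nrank; i++)
    {
        sdispls[i] = sdispls[i-1] + scounts[i-1];
        rdispls[i] = rdispls[i-1] + rcounts[i-1];
    }
    nsend = sdispls[nrank-1] + scounts[nrank-1];
    nrecv = rdispls[nrank-1] + rcounts[nrank-1];

    sendbuf = malloc(nsend*sizeof(double));
    recvbuf = malloc(nrecv*sizeof(double));
    for (i = 0; i < nsend; i++)
    {
        sendbuf[i] = rank;               /* payload: just our rank id */
    }

    MPI_Alltoallv(sendbuf, scounts, sdispls, MPI_DOUBLE,
                  recvbuf, rcounts, rdispls, MPI_DOUBLE, MPI_COMM_WORLD);

    printf("rank %d received %d values\n", rank, nrecv);

    free(scounts); free(rcounts); free(sdispls); free(rdispls);
    free(sendbuf); free(recvbuf);
    MPI_Finalize();
    return 0;
}
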
 
 static void pme_dd_sendrecv(pme_atomcomm_t *atc,
-                            gmx_bool bBackward,int shift,
-                            void *buf_s,int nbyte_s,
-                            void *buf_r,int nbyte_r)
+                            gmx_bool bBackward, int shift,
+                            void *buf_s, int nbyte_s,
+                            void *buf_r, int nbyte_r)
 {
 #ifdef GMX_MPI
-    int dest,src;
+    int        dest, src;
     MPI_Status stat;
 
-    if (bBackward == FALSE) {
+    if (bBackward == FALSE)
+    {
         dest = atc->node_dest[shift];
         src  = atc->node_src[shift];
-    } else {
+    }
+    else
+    {
         dest = atc->node_src[shift];
         src  = atc->node_dest[shift];
     }
 
-    if (nbyte_s > 0 && nbyte_r > 0) {
-        MPI_Sendrecv(buf_s,nbyte_s,MPI_BYTE,
-                     dest,shift,
-                     buf_r,nbyte_r,MPI_BYTE,
-                     src,shift,
-                     atc->mpi_comm,&stat);
-    } else if (nbyte_s > 0) {
-        MPI_Send(buf_s,nbyte_s,MPI_BYTE,
-                 dest,shift,
+    if (nbyte_s > 0 && nbyte_r > 0)
+    {
+        MPI_Sendrecv(buf_s, nbyte_s, MPI_BYTE,
+                     dest, shift,
+                     buf_r, nbyte_r, MPI_BYTE,
+                     src, shift,
+                     atc->mpi_comm, &stat);
+    }
+    else if (nbyte_s > 0)
+    {
+        MPI_Send(buf_s, nbyte_s, MPI_BYTE,
+                 dest, shift,
                  atc->mpi_comm);
-    } else if (nbyte_r > 0) {
-        MPI_Recv(buf_r,nbyte_r,MPI_BYTE,
-                 src,shift,
-                 atc->mpi_comm,&stat);
+    }
+    else if (nbyte_r > 0)
+    {
+        MPI_Recv(buf_r, nbyte_r, MPI_BYTE,
+                 src, shift,
+                 atc->mpi_comm, &stat);
     }
 #endif
 }
@@ -772,42 +803,50 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
                              int n, gmx_bool bX, rvec *x, real *charge,
                              pme_atomcomm_t *atc)
 {
-    int *commnode,*buf_index;
-    int nnodes_comm,i,nsend,local_pos,buf_pos,node,scount,rcount;
+    int *commnode, *buf_index;
+    int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
 
     commnode  = atc->node_dest;
     buf_index = atc->buf_index;
 
-    nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
+    nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
 
     nsend = 0;
-    for(i=0; i<nnodes_comm; i++) {
+    for (i = 0; i < nnodes_comm; i++)
+    {
         buf_index[commnode[i]] = nsend;
-        nsend += atc->count[commnode[i]];
+        nsend                 += atc->count[commnode[i]];
     }
-    if (bX) {
+    if (bX)
+    {
         if (atc->count[atc->nodeid] + nsend != n)
-            gmx_fatal(FARGS,"%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+        {
+            gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
                       "This usually means that your system is not well equilibrated.",
                       n - (atc->count[atc->nodeid] + nsend),
-                      pme->nodeid,'x'+atc->dimind);
+                      pme->nodeid, 'x'+atc->dimind);
+        }
 
-        if (nsend > pme->buf_nalloc) {
+        if (nsend > pme->buf_nalloc)
+        {
             pme->buf_nalloc = over_alloc_dd(nsend);
-            srenew(pme->bufv,pme->buf_nalloc);
-            srenew(pme->bufr,pme->buf_nalloc);
+            srenew(pme->bufv, pme->buf_nalloc);
+            srenew(pme->bufr, pme->buf_nalloc);
         }
 
         atc->n = atc->count[atc->nodeid];
-        for(i=0; i<nnodes_comm; i++) {
+        for (i = 0; i < nnodes_comm; i++)
+        {
             scount = atc->count[commnode[i]];
             /* Communicate the count */
             if (debug)
-                fprintf(debug,"dimind %d PME node %d send to node %d: %d\n",
-                        atc->dimind,atc->nodeid,commnode[i],scount);
-            pme_dd_sendrecv(atc,FALSE,i,
-                            &scount,sizeof(int),
-                            &atc->rcount[i],sizeof(int));
+            {
+                fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+                        atc->dimind, atc->nodeid, commnode[i], scount);
+            }
+            pme_dd_sendrecv(atc, FALSE, i,
+                            &scount, sizeof(int),
+                            &atc->rcount[i], sizeof(int));
             atc->n += atc->rcount[i];
         }
 
@@ -815,19 +854,25 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
     }
 
     local_pos = 0;
-    for(i=0; i<n; i++) {
+    for (i = 0; i < n; i++)
+    {
         node = atc->pd[i];
-        if (node == atc->nodeid) {
+        if (node == atc->nodeid)
+        {
             /* Copy direct to the receive buffer */
-            if (bX) {
-                copy_rvec(x[i],atc->x[local_pos]);
+            if (bX)
+            {
+                copy_rvec(x[i], atc->x[local_pos]);
             }
             atc->q[local_pos] = charge[i];
             local_pos++;
-        } else {
+        }
+        else
+        {
             /* Copy to the send buffer */
-            if (bX) {
-                copy_rvec(x[i],pme->bufv[buf_index[node]]);
+            if (bX)
+            {
+                copy_rvec(x[i], pme->bufv[buf_index[node]]);
             }
             pme->bufr[buf_index[node]] = charge[i];
             buf_index[node]++;
@@ -835,20 +880,23 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
     }
 
     buf_pos = 0;
-    for(i=0; i<nnodes_comm; i++) {
+    for (i = 0; i < nnodes_comm; i++)
+    {
         scount = atc->count[commnode[i]];
         rcount = atc->rcount[i];
-        if (scount > 0 || rcount > 0) {
-            if (bX) {
+        if (scount > 0 || rcount > 0)
+        {
+            if (bX)
+            {
                 /* Communicate the coordinates */
-                pme_dd_sendrecv(atc,FALSE,i,
-                                pme->bufv[buf_pos],scount*sizeof(rvec),
-                                atc->x[local_pos],rcount*sizeof(rvec));
+                pme_dd_sendrecv(atc, FALSE, i,
+                                pme->bufv[buf_pos], scount*sizeof(rvec),
+                                atc->x[local_pos], rcount*sizeof(rvec));
             }
             /* Communicate the charges */
-            pme_dd_sendrecv(atc,FALSE,i,
-                            pme->bufr+buf_pos,scount*sizeof(real),
-                            atc->q+local_pos,rcount*sizeof(real));
+            pme_dd_sendrecv(atc, FALSE, i,
+                            pme->bufr+buf_pos, scount*sizeof(real),
+                            atc->q+local_pos, rcount*sizeof(real));
             buf_pos   += scount;
             local_pos += atc->rcount[i];
         }
@@ -859,65 +907,67 @@ static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
                            int n, rvec *f,
                            gmx_bool bAddF)
 {
-  int *commnode,*buf_index;
-  int nnodes_comm,local_pos,buf_pos,i,scount,rcount,node;
-
-  commnode  = atc->node_dest;
-  buf_index = atc->buf_index;
-
-  nnodes_comm = min(2*atc->maxshift,atc->nslab-1);
-
-  local_pos = atc->count[atc->nodeid];
-  buf_pos = 0;
-  for(i=0; i<nnodes_comm; i++) {
-    scount = atc->rcount[i];
-    rcount = atc->count[commnode[i]];
-    if (scount > 0 || rcount > 0) {
-      /* Communicate the forces */
-      pme_dd_sendrecv(atc,TRUE,i,
-                      atc->f[local_pos],scount*sizeof(rvec),
-                      pme->bufv[buf_pos],rcount*sizeof(rvec));
-      local_pos += scount;
-    }
-    buf_index[commnode[i]] = buf_pos;
-    buf_pos   += rcount;
-  }
+    int *commnode, *buf_index;
+    int  nnodes_comm, local_pos, buf_pos, i, scount, rcount, node;
+
+    commnode  = atc->node_dest;
+    buf_index = atc->buf_index;
+
+    nnodes_comm = min(2*atc->maxshift, atc->nslab-1);
+
+    local_pos = atc->count[atc->nodeid];
+    buf_pos   = 0;
+    for (i = 0; i < nnodes_comm; i++)
+    {
+        scount = atc->rcount[i];
+        rcount = atc->count[commnode[i]];
+        if (scount > 0 || rcount > 0)
+        {
+            /* Communicate the forces */
+            pme_dd_sendrecv(atc, TRUE, i,
+                            atc->f[local_pos], scount*sizeof(rvec),
+                            pme->bufv[buf_pos], rcount*sizeof(rvec));
+            local_pos += scount;
+        }
+        buf_index[commnode[i]] = buf_pos;
+        buf_pos               += rcount;
+    }
 
     local_pos = 0;
     if (bAddF)
     {
-        for(i=0; i<n; i++)
+        for (i = 0; i < n; i++)
         {
             node = atc->pd[i];
             if (node == atc->nodeid)
             {
                 /* Add from the local force array */
-                rvec_inc(f[i],atc->f[local_pos]);
+                rvec_inc(f[i], atc->f[local_pos]);
                 local_pos++;
             }
             else
             {
                 /* Add from the receive buffer */
-                rvec_inc(f[i],pme->bufv[buf_index[node]]);
+                rvec_inc(f[i], pme->bufv[buf_index[node]]);
                 buf_index[node]++;
             }
         }
     }
     else
     {
-        for(i=0; i<n; i++)
+        for (i = 0; i < n; i++)
         {
             node = atc->pd[i];
             if (node == atc->nodeid)
             {
                 /* Copy from the local force array */
-                copy_rvec(atc->f[local_pos],f[i]);
+                copy_rvec(atc->f[local_pos], f[i]);
                 local_pos++;
             }
             else
             {
                 /* Copy from the receive buffer */
-                copy_rvec(pme->bufv[buf_index[node]],f[i]);
+                copy_rvec(pme->bufv[buf_index[node]], f[i]);
                 buf_index[node]++;
             }
         }
@@ -929,26 +979,26 @@ static void
 gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 {
     pme_overlap_t *overlap;
-    int send_index0,send_nindex;
-    int recv_index0,recv_nindex;
-    MPI_Status stat;
-    int i,j,k,ix,iy,iz,icnt;
-    int ipulse,send_id,recv_id,datasize;
-    real *p;
-    real *sendptr,*recvptr;
+    int            send_index0, send_nindex;
+    int            recv_index0, recv_nindex;
+    MPI_Status     stat;
+    int            i, j, k, ix, iy, iz, icnt;
+    int            ipulse, send_id, recv_id, datasize;
+    real          *p;
+    real          *sendptr, *recvptr;
 
     /* Start with minor-rank communication. This is a bit of a pain since it is not contiguous */
     overlap = &pme->overlap[1];
 
-    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
+    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
     {
         /* Since we have already (un)wrapped the overlap in the z-dimension,
          * we only have to communicate 0 to nkz (not pmegrid_nz).
          */
-        if (direction==GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_QGRID_FORWARD)
         {
-            send_id = overlap->send_id[ipulse];
-            recv_id = overlap->recv_id[ipulse];
+            send_id       = overlap->send_id[ipulse];
+            recv_id       = overlap->recv_id[ipulse];
             send_index0   = overlap->comm_data[ipulse].send_index0;
             send_nindex   = overlap->comm_data[ipulse].send_nindex;
             recv_index0   = overlap->comm_data[ipulse].recv_index0;
@@ -956,8 +1006,8 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         }
         else
         {
-            send_id = overlap->recv_id[ipulse];
-            recv_id = overlap->send_id[ipulse];
+            send_id       = overlap->recv_id[ipulse];
+            recv_id       = overlap->send_id[ipulse];
             send_index0   = overlap->comm_data[ipulse].recv_index0;
             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
             recv_index0   = overlap->comm_data[ipulse].send_index0;
@@ -967,20 +1017,20 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         /* Copy data to contiguous send buffer */
         if (debug)
         {
-            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
-                    pme->nodeid,overlap->nodeid,send_id,
+            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+                    pme->nodeid, overlap->nodeid, send_id,
                     pme->pmegrid_start_iy,
                     send_index0-pme->pmegrid_start_iy,
                     send_index0-pme->pmegrid_start_iy+send_nindex);
         }
         icnt = 0;
-        for(i=0;i<pme->pmegrid_nx;i++)
+        for (i = 0; i < pme->pmegrid_nx; i++)
         {
             ix = i;
-            for(j=0;j<send_nindex;j++)
+            for (j = 0; j < send_nindex; j++)
             {
                 iy = j + send_index0 - pme->pmegrid_start_iy;
-                for(k=0;k<pme->nkz;k++)
+                for (k = 0; k < pme->nkz; k++)
                 {
                     iz = k;
                     overlap->sendbuf[icnt++] = grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz];
@@ -990,32 +1040,32 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 
         datasize      = pme->pmegrid_nx * pme->nkz;
 
-        MPI_Sendrecv(overlap->sendbuf,send_nindex*datasize,GMX_MPI_REAL,
-                     send_id,ipulse,
-                     overlap->recvbuf,recv_nindex*datasize,GMX_MPI_REAL,
-                     recv_id,ipulse,
-                     overlap->mpi_comm,&stat);
+        MPI_Sendrecv(overlap->sendbuf, send_nindex*datasize, GMX_MPI_REAL,
+                     send_id, ipulse,
+                     overlap->recvbuf, recv_nindex*datasize, GMX_MPI_REAL,
+                     recv_id, ipulse,
+                     overlap->mpi_comm, &stat);
 
         /* Get data from contiguous recv buffer */
         if (debug)
         {
-            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
-                    pme->nodeid,overlap->nodeid,recv_id,
+            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+                    pme->nodeid, overlap->nodeid, recv_id,
                     pme->pmegrid_start_iy,
                     recv_index0-pme->pmegrid_start_iy,
                     recv_index0-pme->pmegrid_start_iy+recv_nindex);
         }
         icnt = 0;
-        for(i=0;i<pme->pmegrid_nx;i++)
+        for (i = 0; i < pme->pmegrid_nx; i++)
         {
             ix = i;
-            for(j=0;j<recv_nindex;j++)
+            for (j = 0; j < recv_nindex; j++)
             {
                 iy = j + recv_index0 - pme->pmegrid_start_iy;
-                for(k=0;k<pme->nkz;k++)
+                for (k = 0; k < pme->nkz; k++)
                 {
                     iz = k;
-                    if(direction==GMX_SUM_QGRID_FORWARD)
+                    if (direction == GMX_SUM_QGRID_FORWARD)
                     {
                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                     }
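
The y-slab exchanged here is not contiguous in the row-major grid, so it is packed element by element into overlap->sendbuf before MPI_Sendrecv and accumulated back out of recvbuf afterwards. The packing index pattern in isolation, with toy sizes and hypothetical names:

#include <stdio.h>

#define NX 2
#define NY 4
#define NZ 3

/* Pack rows iy0 .. iy0+niy-1 of a row-major NX x NY x NZ grid into a
 * contiguous buffer, in the same order the loop above uses. */
static int pack_y_slab(const double *grid, int iy0, int niy, double *buf)
{
    int icnt = 0, ix, iy, iz;

    for (ix = 0; ix < NX; ix++)
    {
        for (iy = iy0; iy < iy0 + niy; iy++)
        {
            for (iz = 0; iz < NZ; iz++)
            {
                buf[icnt++] = grid[(ix*NY + iy)*NZ + iz];
            }
        }
    }
    return icnt;                         /* = NX*niy*NZ elements packed */
}

int main(void)
{
    double grid[NX*NY*NZ];
    double buf[NX*NY*NZ];
    int    i, n;

    for (i = 0; i < NX*NY*NZ; i++)
    {
        grid[i] = i;
    }
    n = pack_y_slab(grid, 2, 1, buf);    /* pack the iy==2 plane */
    printf("packed %d elements, first three: %g %g %g\n",
           n, buf[0], buf[1], buf[2]);   /* 6 elements: 6 7 8    */
    return 0;
}
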
@@ -1035,27 +1085,27 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
      */
     overlap = &pme->overlap[0];
 
-    for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
+    for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
     {
-        if(direction==GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_QGRID_FORWARD)
         {
-            send_id = overlap->send_id[ipulse];
-            recv_id = overlap->recv_id[ipulse];
+            send_id       = overlap->send_id[ipulse];
+            recv_id       = overlap->recv_id[ipulse];
             send_index0   = overlap->comm_data[ipulse].send_index0;
             send_nindex   = overlap->comm_data[ipulse].send_nindex;
             recv_index0   = overlap->comm_data[ipulse].recv_index0;
             recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
-            recvptr   = overlap->recvbuf;
+            recvptr       = overlap->recvbuf;
         }
         else
         {
-            send_id = overlap->recv_id[ipulse];
-            recv_id = overlap->send_id[ipulse];
+            send_id       = overlap->recv_id[ipulse];
+            recv_id       = overlap->send_id[ipulse];
             send_index0   = overlap->comm_data[ipulse].recv_index0;
             send_nindex   = overlap->comm_data[ipulse].recv_nindex;
             recv_index0   = overlap->comm_data[ipulse].send_index0;
             recv_nindex   = overlap->comm_data[ipulse].send_nindex;
-            recvptr   = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
+            recvptr       = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
         }
 
         sendptr       = grid + (send_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
@@ -1063,29 +1113,29 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 
         if (debug)
         {
-            fprintf(debug,"PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
-                    pme->nodeid,overlap->nodeid,send_id,
+            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+                    pme->nodeid, overlap->nodeid, send_id,
                     pme->pmegrid_start_ix,
                     send_index0-pme->pmegrid_start_ix,
                     send_index0-pme->pmegrid_start_ix+send_nindex);
-            fprintf(debug,"PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
-                    pme->nodeid,overlap->nodeid,recv_id,
+            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+                    pme->nodeid, overlap->nodeid, recv_id,
                     pme->pmegrid_start_ix,
                     recv_index0-pme->pmegrid_start_ix,
                     recv_index0-pme->pmegrid_start_ix+recv_nindex);
         }
 
-        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
-                     send_id,ipulse,
-                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
-                     recv_id,ipulse,
-                     overlap->mpi_comm,&stat);
+        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
+                     send_id, ipulse,
+                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
+                     recv_id, ipulse,
+                     overlap->mpi_comm, &stat);
 
         /* ADD data from contiguous recv buffer */
-        if(direction==GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_QGRID_FORWARD)
         {
             p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
-            for(i=0;i<recv_nindex*datasize;i++)
+            for (i = 0; i < recv_nindex*datasize; i++)
             {
                 p[i] += overlap->recvbuf[i];
             }
@@ -1098,10 +1148,10 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 static int
 copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
 {
-    ivec    local_fft_ndata,local_fft_offset,local_fft_size;
+    ivec    local_fft_ndata, local_fft_offset, local_fft_size;
     ivec    local_pme_size;
-    int     i,ix,iy,iz;
-    int     pmeidx,fftidx;
+    int     i, ix, iy, iz;
+    int     pmeidx, fftidx;
 
     /* Dimensions should be identical for A/B grid, so we just use A here */
     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
@@ -1114,48 +1164,52 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
     local_pme_size[2] = pme->pmegrid_nz;
 
     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
-     the offset is identical, and the PME grid always has more data (due to overlap)
+       the offset is identical, and the PME grid always has more data (due to overlap)
      */
     {
 #ifdef DEBUG_PME
-        FILE *fp,*fp2;
-        char fn[STRLEN],format[STRLEN];
-        real val;
-        sprintf(fn,"pmegrid%d.pdb",pme->nodeid);
-        fp = ffopen(fn,"w");
-        sprintf(fn,"pmegrid%d.txt",pme->nodeid);
-        fp2 = ffopen(fn,"w");
-     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
+        FILE *fp, *fp2;
+        char  fn[STRLEN], format[STRLEN];
+        real  val;
+        sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
+        fp = ffopen(fn, "w");
+        sprintf(fn, "pmegrid%d.txt", pme->nodeid);
+        fp2 = ffopen(fn, "w");
+        sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
 #endif
 
-    for(ix=0;ix<local_fft_ndata[XX];ix++)
-    {
-        for(iy=0;iy<local_fft_ndata[YY];iy++)
+        for (ix = 0; ix < local_fft_ndata[XX]; ix++)
         {
-            for(iz=0;iz<local_fft_ndata[ZZ];iz++)
+            for (iy = 0; iy < local_fft_ndata[YY]; iy++)
             {
-                pmeidx = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
-                fftidx = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
-                fftgrid[fftidx] = pmegrid[pmeidx];
+                for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
+                {
+                    pmeidx          = ix*(local_pme_size[YY]*local_pme_size[ZZ])+iy*(local_pme_size[ZZ])+iz;
+                    fftidx          = ix*(local_fft_size[YY]*local_fft_size[ZZ])+iy*(local_fft_size[ZZ])+iz;
+                    fftgrid[fftidx] = pmegrid[pmeidx];
 #ifdef DEBUG_PME
-                val = 100*pmegrid[pmeidx];
-                if (pmegrid[pmeidx] != 0)
-                fprintf(fp,format,"ATOM",pmeidx,"CA","GLY",' ',pmeidx,' ',
-                        5.0*ix,5.0*iy,5.0*iz,1.0,val);
-                if (pmegrid[pmeidx] != 0)
-                    fprintf(fp2,"%-12s  %5d  %5d  %5d  %12.5e\n",
-                            "qgrid",
-                            pme->pmegrid_start_ix + ix,
-                            pme->pmegrid_start_iy + iy,
-                            pme->pmegrid_start_iz + iz,
-                            pmegrid[pmeidx]);
+                    val = 100*pmegrid[pmeidx];
+                    if (pmegrid[pmeidx] != 0)
+                    {
+                        fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
+                                5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+                    }
+                    if (pmegrid[pmeidx] != 0)
+                    {
+                        fprintf(fp2, "%-12s  %5d  %5d  %5d  %12.5e\n",
+                                "qgrid",
+                                pme->pmegrid_start_ix + ix,
+                                pme->pmegrid_start_iy + iy,
+                                pme->pmegrid_start_iz + iz,
+                                pmegrid[pmeidx]);
+                    }
 #endif
+                }
             }
         }
-    }
 #ifdef DEBUG_PME
-    ffclose(fp);
-    ffclose(fp2);
+        ffclose(fp);
+        ffclose(fp2);
 #endif
     }
     return 0;
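
copy_pmegrid_to_fftgrid only re-linearizes each (ix, iy, iz) triple with two different sets of strides, the padded PME grid sizes and the local FFT grid sizes. A self-contained sketch of that strided block copy (toy sizes, hypothetical names):

#include <stdio.h>

/* Copy an n[0] x n[1] x n[2] block between two row-major 3D arrays
 * whose allocated (padded) sizes differ, as copy_pmegrid_to_fftgrid does. */
static void copy_block(const double *src, const int src_size[3],
                       double *dst, const int dst_size[3],
                       const int n[3])
{
    int ix, iy, iz;

    for (ix = 0; ix < n[0]; ix++)
    {
        for (iy = 0; iy < n[1]; iy++)
        {
            for (iz = 0; iz < n[2]; iz++)
            {
                int srcidx = (ix*src_size[1] + iy)*src_size[2] + iz;
                int dstidx = (ix*dst_size[1] + iy)*dst_size[2] + iz;

                dst[dstidx] = src[srcidx];
            }
        }
    }
}

int main(void)
{
    const int pme_size[3] = { 3, 3, 4 };   /* padded source strides */
    const int fft_size[3] = { 2, 2, 2 };   /* tight destination     */
    const int ndata[3]    = { 2, 2, 2 };   /* region actually used  */
    double    pmegrid[3*3*4];
    double    fftgrid[2*2*2];
    int       i;

    for (i = 0; i < 3*3*4; i++)
    {
        pmegrid[i] = i;
    }
    copy_block(pmegrid, pme_size, fftgrid, fft_size, ndata);
    printf("%g %g %g %g\n", fftgrid[0], fftgrid[1], fftgrid[2], fftgrid[3]);
    /* prints 0 1 4 5: rows of the padded grid, re-packed tightly */
    return 0;
}
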
@@ -1175,16 +1229,16 @@ static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 
 static int
 copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
-                        int nthread,int thread)
+                        int nthread, int thread)
 {
-    ivec    local_fft_ndata,local_fft_offset,local_fft_size;
-    ivec    local_pme_size;
-    int     ixy0,ixy1,ixy,ix,iy,iz;
-    int     pmeidx,fftidx;
+    ivec          local_fft_ndata, local_fft_offset, local_fft_size;
+    ivec          local_pme_size;
+    int           ixy0, ixy1, ixy, ix, iy, iz;
+    int           pmeidx, fftidx;
 #ifdef PME_TIME_THREADS
-    gmx_cycles_t c1;
-    static double cs1=0;
-    static int cnt=0;
+    gmx_cycles_t  c1;
+    static double cs1 = 0;
+    static int    cnt = 0;
 #endif
 
 #ifdef PME_TIME_THREADS
@@ -1201,31 +1255,31 @@ copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
     local_pme_size[2] = pme->pmegrid_nz;
 
     /* The fftgrid is always 'justified' to the lower-left corner of the PME grid,
-     the offset is identical, and the PME grid always has more data (due to overlap)
+       the offset is identical, and the PME grid always has more data (due to overlap)
      */
     ixy0 = ((thread  )*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
     ixy1 = ((thread+1)*local_fft_ndata[XX]*local_fft_ndata[YY])/nthread;
 
-    for(ixy=ixy0;ixy<ixy1;ixy++)
+    for (ixy = ixy0; ixy < ixy1; ixy++)
     {
         ix = ixy/local_fft_ndata[YY];
         iy = ixy - ix*local_fft_ndata[YY];
 
         pmeidx = (ix*local_pme_size[YY] + iy)*local_pme_size[ZZ];
         fftidx = (ix*local_fft_size[YY] + iy)*local_fft_size[ZZ];
-        for(iz=0;iz<local_fft_ndata[ZZ];iz++)
+        for (iz = 0; iz < local_fft_ndata[ZZ]; iz++)
         {
             pmegrid[pmeidx+iz] = fftgrid[fftidx+iz];
         }
     }
 
 #ifdef PME_TIME_THREADS
-    c1 = omp_cyc_end(c1);
+    c1   = omp_cyc_end(c1);
     cs1 += (double)c1;
     cnt++;
     if (cnt % 20 == 0)
     {
-        printf("copy %.2f\n",cs1*1e-9);
+        printf("copy %.2f\n", cs1*1e-9);
     }
 #endif
 
@@ -1236,7 +1290,7 @@ copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
 static void
 wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 {
-    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix,iy,iz;
+    int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
 
     nx = pme->nkx;
     ny = pme->nky;
@@ -1249,11 +1303,11 @@ wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     overlap = pme->pme_order - 1;
 
     /* Add periodic overlap in z */
-    for(ix=0; ix<pme->pmegrid_nx; ix++)
+    for (ix = 0; ix < pme->pmegrid_nx; ix++)
     {
-        for(iy=0; iy<pme->pmegrid_ny; iy++)
+        for (iy = 0; iy < pme->pmegrid_ny; iy++)
         {
-            for(iz=0; iz<overlap; iz++)
+            for (iz = 0; iz < overlap; iz++)
             {
                 pmegrid[(ix*pny+iy)*pnz+iz] +=
                     pmegrid[(ix*pny+iy)*pnz+nz+iz];
@@ -1263,28 +1317,28 @@ wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 
     if (pme->nnodes_minor == 1)
     {
-       for(ix=0; ix<pme->pmegrid_nx; ix++)
-       {
-           for(iy=0; iy<overlap; iy++)
-           {
-               for(iz=0; iz<nz; iz++)
-               {
-                   pmegrid[(ix*pny+iy)*pnz+iz] +=
-                       pmegrid[(ix*pny+ny+iy)*pnz+iz];
-               }
-           }
-       }
+        for (ix = 0; ix < pme->pmegrid_nx; ix++)
+        {
+            for (iy = 0; iy < overlap; iy++)
+            {
+                for (iz = 0; iz < nz; iz++)
+                {
+                    pmegrid[(ix*pny+iy)*pnz+iz] +=
+                        pmegrid[(ix*pny+ny+iy)*pnz+iz];
+                }
+            }
+        }
     }
 
     if (pme->nnodes_major == 1)
     {
         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 
-        for(ix=0; ix<overlap; ix++)
+        for (ix = 0; ix < overlap; ix++)
         {
-            for(iy=0; iy<ny_x; iy++)
+            for (iy = 0; iy < ny_x; iy++)
             {
-                for(iz=0; iz<nz; iz++)
+                for (iz = 0; iz < nz; iz++)
                 {
                     pmegrid[(ix*pny+iy)*pnz+iz] +=
                         pmegrid[((nx+ix)*pny+iy)*pnz+iz];
@@ -1298,7 +1352,7 @@ wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 static void
 unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 {
-    int     nx,ny,nz,pnx,pny,pnz,ny_x,overlap,ix;
+    int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
 
     nx = pme->nkx;
     ny = pme->nky;
@@ -1314,13 +1368,13 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     {
         ny_x = (pme->nnodes_minor == 1 ? ny : pme->pmegrid_ny);
 
-        for(ix=0; ix<overlap; ix++)
+        for (ix = 0; ix < overlap; ix++)
         {
-            int iy,iz;
+            int iy, iz;
 
-            for(iy=0; iy<ny_x; iy++)
+            for (iy = 0; iy < ny_x; iy++)
             {
-                for(iz=0; iz<nz; iz++)
+                for (iz = 0; iz < nz; iz++)
                 {
                     pmegrid[((nx+ix)*pny+iy)*pnz+iz] =
                         pmegrid[(ix*pny+iy)*pnz+iz];
@@ -1332,30 +1386,30 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     if (pme->nnodes_minor == 1)
     {
 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
-       for(ix=0; ix<pme->pmegrid_nx; ix++)
-       {
-           int iy,iz;
+        for (ix = 0; ix < pme->pmegrid_nx; ix++)
+        {
+            int iy, iz;
 
-           for(iy=0; iy<overlap; iy++)
-           {
-               for(iz=0; iz<nz; iz++)
-               {
-                   pmegrid[(ix*pny+ny+iy)*pnz+iz] =
-                       pmegrid[(ix*pny+iy)*pnz+iz];
-               }
-           }
-       }
+            for (iy = 0; iy < overlap; iy++)
+            {
+                for (iz = 0; iz < nz; iz++)
+                {
+                    pmegrid[(ix*pny+ny+iy)*pnz+iz] =
+                        pmegrid[(ix*pny+iy)*pnz+iz];
+                }
+            }
+        }
     }
 
     /* Copy periodic overlap in z */
 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
-    for(ix=0; ix<pme->pmegrid_nx; ix++)
+    for (ix = 0; ix < pme->pmegrid_nx; ix++)
     {
-        int iy,iz;
+        int iy, iz;
 
-        for(iy=0; iy<pme->pmegrid_ny; iy++)
+        for (iy = 0; iy < pme->pmegrid_ny; iy++)
         {
-            for(iz=0; iz<overlap; iz++)
+            for (iz = 0; iz < overlap; iz++)
             {
                 pmegrid[(ix*pny+iy)*pnz+nz+iz] =
                     pmegrid[(ix*pny+iy)*pnz+iz];
@@ -1364,37 +1418,37 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     }
 }
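
wrap_periodic_pmegrid folds the pme_order-1 points written beyond the periodic cell back onto its start, and unwrap_periodic_pmegrid copies them out again before force gathering. The same idea reduced to one dimension (hypothetical names and sizes):

#include <stdio.h>

#define N       8      /* periodic grid size                    */
#define OVERLAP 3      /* pme_order - 1 extra points at the end */

/* Fold the overlap region back into the periodic cell (spreading). */
static void wrap_1d(double grid[N+OVERLAP])
{
    int i;

    for (i = 0; i < OVERLAP; i++)
    {
        grid[i] += grid[N+i];
    }
}

/* Duplicate the start of the cell into the overlap region (gathering). */
static void unwrap_1d(double grid[N+OVERLAP])
{
    int i;

    for (i = 0; i < OVERLAP; i++)
    {
        grid[N+i] = grid[i];
    }
}

int main(void)
{
    double grid[N+OVERLAP] = { 0 };

    grid[N] = 1.0;        /* charge spread just past the cell edge */
    wrap_1d(grid);
    printf("grid[0]  = %g\n", grid[0]);      /* 1: folded back to the start  */
    unwrap_1d(grid);
    printf("grid[%d] = %g\n", N, grid[N]);   /* 1: copied out for gathering  */
    return 0;
}
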
 
-static void clear_grid(int nx,int ny,int nz,real *grid,
-                       ivec fs,int *flag,
-                       int fx,int fy,int fz,
+static void clear_grid(int nx, int ny, int nz, real *grid,
+                       ivec fs, int *flag,
+                       int fx, int fy, int fz,
                        int order)
 {
-    int nc,ncz;
-    int fsx,fsy,fsz,gx,gy,gz,g0x,g0y,x,y,z;
+    int nc, ncz;
+    int fsx, fsy, fsz, gx, gy, gz, g0x, g0y, x, y, z;
     int flind;
 
     nc  = 2 + (order - 2)/FLBS;
     ncz = 2 + (order - 2)/FLBSZ;
 
-    for(fsx=fx; fsx<fx+nc; fsx++)
+    for (fsx = fx; fsx < fx+nc; fsx++)
     {
-        for(fsy=fy; fsy<fy+nc; fsy++)
+        for (fsy = fy; fsy < fy+nc; fsy++)
         {
-            for(fsz=fz; fsz<fz+ncz; fsz++)
+            for (fsz = fz; fsz < fz+ncz; fsz++)
             {
                 flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
                 if (flag[flind] == 0)
                 {
-                    gx = fsx*FLBS;
-                    gy = fsy*FLBS;
-                    gz = fsz*FLBSZ;
+                    gx  = fsx*FLBS;
+                    gy  = fsy*FLBS;
+                    gz  = fsz*FLBSZ;
                     g0x = (gx*ny + gy)*nz + gz;
-                    for(x=0; x<FLBS; x++)
+                    for (x = 0; x < FLBS; x++)
                     {
                         g0y = g0x;
-                        for(y=0; y<FLBS; y++)
+                        for (y = 0; y < FLBS; y++)
                         {
-                            for(z=0; z<FLBSZ; z++)
+                            for (z = 0; z < FLBSZ; z++)
                             {
                                 grid[g0y+z] = 0;
                             }
@@ -1412,23 +1466,23 @@ static void clear_grid(int nx,int ny,int nz,real *grid,
 
 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
 #define DO_BSPLINE(order)                            \
-for(ithx=0; (ithx<order); ithx++)                    \
-{                                                    \
-    index_x = (i0+ithx)*pny*pnz;                     \
-    valx    = qn*thx[ithx];                          \
+    for (ithx = 0; (ithx < order); ithx++)                    \
+    {                                                    \
+        index_x = (i0+ithx)*pny*pnz;                     \
+        valx    = qn*thx[ithx];                          \
                                                      \
-    for(ithy=0; (ithy<order); ithy++)                \
-    {                                                \
-        valxy    = valx*thy[ithy];                   \
-        index_xy = index_x+(j0+ithy)*pnz;            \
+        for (ithy = 0; (ithy < order); ithy++)                \
+        {                                                \
+            valxy    = valx*thy[ithy];                   \
+            index_xy = index_x+(j0+ithy)*pnz;            \
                                                      \
-        for(ithz=0; (ithz<order); ithz++)            \
-        {                                            \
-            index_xyz        = index_xy+(k0+ithz);   \
-            grid[index_xyz] += valxy*thz[ithz];      \
-        }                                            \
-    }                                                \
-}
+            for (ithz = 0; (ithz < order); ithz++)            \
+            {                                            \
+                index_xyz        = index_xy+(k0+ithz);   \
+                grid[index_xyz] += valxy*thz[ithz];      \
+            }                                            \
+        }                                                \
+    }
 
 
 static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
@@ -1437,16 +1491,16 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 {
 
     /* spread charges from home atoms to local grid */
-    real     *grid;
+    real          *grid;
     pme_overlap_t *ol;
-    int      b,i,nn,n,ithx,ithy,ithz,i0,j0,k0;
-    int *    idxptr;
-    int      order,norder,index_x,index_xy,index_xyz;
-    real     valx,valxy,qn;
-    real     *thx,*thy,*thz;
-    int      localsize, bndsize;
-    int      pnx,pny,pnz,ndatatot;
-    int      offx,offy,offz;
+    int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
+    int       *    idxptr;
+    int            order, norder, index_x, index_xy, index_xyz;
+    real           valx, valxy, qn;
+    real          *thx, *thy, *thz;
+    int            localsize, bndsize;
+    int            pnx, pny, pnz, ndatatot;
+    int            offx, offy, offz;
 
     pnx = pmegrid->s[XX];
     pny = pmegrid->s[YY];
@@ -1457,15 +1511,15 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
     offz = pmegrid->offset[ZZ];
 
     ndatatot = pnx*pny*pnz;
-    grid = pmegrid->grid;
-    for(i=0;i<ndatatot;i++)
+    grid     = pmegrid->grid;
+    for (i = 0; i < ndatatot; i++)
     {
         grid[i] = 0;
     }
-    
+
     order = pmegrid->order;
 
-    for(nn=0; nn<spline->n; nn++)
+    for (nn = 0; nn < spline->n; nn++)
     {
         n  = spline->ind[nn];
         qn = atc->q[n];
@@ -1483,8 +1537,9 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
             thy = spline->theta[YY] + norder;
             thz = spline->theta[ZZ] + norder;
 
-            switch (order) {
-            case 4:
+            switch (order)
+            {
+                case 4:
 #ifdef PME_SSE
 #ifdef PME_SSE_UNALIGNED
 #define PME_SPREAD_SSE_ORDER4
@@ -1494,27 +1549,27 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 #endif
 #include "pme_sse_single.h"
 #else
-                DO_BSPLINE(4);
+                    DO_BSPLINE(4);
 #endif
-                break;
-            case 5:
+                    break;
+                case 5:
 #ifdef PME_SSE
 #define PME_SPREAD_SSE_ALIGNED
 #define PME_ORDER 5
 #include "pme_sse_single.h"
 #else
-                DO_BSPLINE(5);
+                    DO_BSPLINE(5);
 #endif
-                break;
-            default:
-                DO_BSPLINE(order);
-                break;
+                    break;
+                default:
+                    DO_BSPLINE(order);
+                    break;
             }
         }
     }
 }
 
-static void set_grid_alignment(int *pmegrid_nz,int pme_order)
+static void set_grid_alignment(int *pmegrid_nz, int pme_order)
 {
 #ifdef PME_SSE
     if (pme_order == 5
@@ -1529,7 +1584,7 @@ static void set_grid_alignment(int *pmegrid_nz,int pme_order)
 #endif
 }
 
-static void set_gridsize_alignment(int *gridsize,int pme_order)
+static void set_gridsize_alignment(int *gridsize, int pme_order)
 {
 #ifdef PME_SSE
 #ifndef PME_SSE_UNALIGNED
@@ -1540,7 +1595,7 @@ static void set_gridsize_alignment(int *gridsize,int pme_order)
          * Note that for pme_order=5, the pme grid z-size alignment
          * ensures that we will not go beyond the grid size.
          */
-         *gridsize += 4;
+        *gridsize += 4;
     }
 #endif
 #endif
@@ -1554,21 +1609,21 @@ static void pmegrid_init(pmegrid_t *grid,
                          int pme_order,
                          real *ptr)
 {
-    int nz,gridsize;
+    int nz, gridsize;
 
-    grid->ci[XX] = cx;
-    grid->ci[YY] = cy;
-    grid->ci[ZZ] = cz;
+    grid->ci[XX]     = cx;
+    grid->ci[YY]     = cy;
+    grid->ci[ZZ]     = cz;
     grid->offset[XX] = x0;
     grid->offset[YY] = y0;
     grid->offset[ZZ] = z0;
     grid->n[XX]      = x1 - x0 + pme_order - 1;
     grid->n[YY]      = y1 - y0 + pme_order - 1;
     grid->n[ZZ]      = z1 - z0 + pme_order - 1;
-    copy_ivec(grid->n,grid->s);
+    copy_ivec(grid->n, grid->s);
 
     nz = grid->s[ZZ];
-    set_grid_alignment(&nz,pme_order);
+    set_grid_alignment(&nz, pme_order);
     if (set_alignment)
     {
         grid->s[ZZ] = nz;
@@ -1582,8 +1637,8 @@ static void pmegrid_init(pmegrid_t *grid,
     if (ptr == NULL)
     {
         gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
-        set_gridsize_alignment(&gridsize,pme_order);
-        snew_aligned(grid->grid,gridsize,16);
+        set_gridsize_alignment(&gridsize, pme_order);
+        snew_aligned(grid->grid, gridsize, 16);
     }
     else
     {
@@ -1591,24 +1646,24 @@ static void pmegrid_init(pmegrid_t *grid,
     }
 }
 
-static int div_round_up(int enumerator,int denominator)
+static int div_round_up(int enumerator, int denominator)
 {
     return (enumerator + denominator - 1)/denominator;
 }
 
-static void make_subgrid_division(const ivec n,int ovl,int nthread,
+static void make_subgrid_division(const ivec n, int ovl, int nthread,
                                   ivec nsub)
 {
-    int gsize_opt,gsize;
-    int nsx,nsy,nsz;
+    int gsize_opt, gsize;
+    int nsx, nsy, nsz;
     char *env;
 
     gsize_opt = -1;
-    for(nsx=1; nsx<=nthread; nsx++)
+    for (nsx = 1; nsx <= nthread; nsx++)
     {
         if (nthread % nsx == 0)
         {
-            for(nsy=1; nsy<=nthread; nsy++)
+            for (nsy = 1; nsy <= nthread; nsy++)
             {
                 if (nsx*nsy <= nthread && nthread % (nsx*nsy) == 0)
                 {
@@ -1616,9 +1671,9 @@ static void make_subgrid_division(const ivec n,int ovl,int nthread,
 
                     /* Determine the number of grid points per thread */
                     gsize =
-                        (div_round_up(n[XX],nsx) + ovl)*
-                        (div_round_up(n[YY],nsy) + ovl)*
-                        (div_round_up(n[ZZ],nsz) + ovl);
+                        (div_round_up(n[XX], nsx) + ovl)*
+                        (div_round_up(n[YY], nsy) + ovl)*
+                        (div_round_up(n[ZZ], nsz) + ovl);
 
                     /* Minimize the number of grids points per thread
                      * and, secondarily, the number of cuts in minor dimensions.
@@ -1628,9 +1683,9 @@ static void make_subgrid_division(const ivec n,int ovl,int nthread,
                         (gsize == gsize_opt &&
                          (nsz < nsub[ZZ] || (nsz == nsub[ZZ] && nsy < nsub[YY]))))
                     {
-                        nsub[XX] = nsx;
-                        nsub[YY] = nsy;
-                        nsub[ZZ] = nsz;
+                        nsub[XX]  = nsx;
+                        nsub[YY]  = nsy;
+                        nsub[ZZ]  = nsz;
                         gsize_opt = gsize;
                     }
                 }
@@ -1641,76 +1696,76 @@ static void make_subgrid_division(const ivec n,int ovl,int nthread,
     env = getenv("GMX_PME_THREAD_DIVISION");
     if (env != NULL)
     {
-        sscanf(env,"%d %d %d",&nsub[XX],&nsub[YY],&nsub[ZZ]);
+        sscanf(env, "%d %d %d", &nsub[XX], &nsub[YY], &nsub[ZZ]);
     }
 
     if (nsub[XX]*nsub[YY]*nsub[ZZ] != nthread)
     {
-        gmx_fatal(FARGS,"PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)",nsub[XX],nsub[YY],nsub[ZZ],nthread);
+        gmx_fatal(FARGS, "PME grid thread division (%d x %d x %d) does not match the total number of threads (%d)", nsub[XX], nsub[YY], nsub[ZZ], nthread);
     }
 }
 
 static void pmegrids_init(pmegrids_t *grids,
-                          int nx,int ny,int nz,int nz_base,
+                          int nx, int ny, int nz, int nz_base,
                           int pme_order,
                           int nthread,
                           int overlap_x,
                           int overlap_y)
 {
-    ivec n,n_base,g0,g1;
-    int t,x,y,z,d,i,tfac;
-    int max_comm_lines=-1;
+    ivec n, n_base, g0, g1;
+    int t, x, y, z, d, i, tfac;
+    int max_comm_lines = -1;
 
     n[XX] = nx - (pme_order - 1);
     n[YY] = ny - (pme_order - 1);
     n[ZZ] = nz - (pme_order - 1);
 
-    copy_ivec(n,n_base);
+    copy_ivec(n, n_base);
     n_base[ZZ] = nz_base;
 
-    pmegrid_init(&grids->grid,0,0,0,0,0,0,n[XX],n[YY],n[ZZ],FALSE,pme_order,
+    pmegrid_init(&grids->grid, 0, 0, 0, 0, 0, 0, n[XX], n[YY], n[ZZ], FALSE, pme_order,
                  NULL);
 
     grids->nthread = nthread;
 
-    make_subgrid_division(n_base,pme_order-1,grids->nthread,grids->nc);
+    make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
 
     if (grids->nthread > 1)
     {
         ivec nst;
         int gridsize;
 
-        for(d=0; d<DIM; d++)
+        for (d = 0; d < DIM; d++)
         {
-            nst[d] = div_round_up(n[d],grids->nc[d]) + pme_order - 1;
+            nst[d] = div_round_up(n[d], grids->nc[d]) + pme_order - 1;
         }
-        set_grid_alignment(&nst[ZZ],pme_order);
+        set_grid_alignment(&nst[ZZ], pme_order);
 
         if (debug)
         {
-            fprintf(debug,"pmegrid thread local division: %d x %d x %d\n",
-                    grids->nc[XX],grids->nc[YY],grids->nc[ZZ]);
-            fprintf(debug,"pmegrid %d %d %d max thread pmegrid %d %d %d\n",
-                    nx,ny,nz,
-                    nst[XX],nst[YY],nst[ZZ]);
+            fprintf(debug, "pmegrid thread local division: %d x %d x %d\n",
+                    grids->nc[XX], grids->nc[YY], grids->nc[ZZ]);
+            fprintf(debug, "pmegrid %d %d %d max thread pmegrid %d %d %d\n",
+                    nx, ny, nz,
+                    nst[XX], nst[YY], nst[ZZ]);
         }
 
-        snew(grids->grid_th,grids->nthread);
-        t = 0;
+        snew(grids->grid_th, grids->nthread);
+        t        = 0;
         gridsize = nst[XX]*nst[YY]*nst[ZZ];
-        set_gridsize_alignment(&gridsize,pme_order);
+        set_gridsize_alignment(&gridsize, pme_order);
         snew_aligned(grids->grid_all,
                      grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
                      16);
 
-        for(x=0; x<grids->nc[XX]; x++)
+        for (x = 0; x < grids->nc[XX]; x++)
         {
-            for(y=0; y<grids->nc[YY]; y++)
+            for (y = 0; y < grids->nc[YY]; y++)
             {
-                for(z=0; z<grids->nc[ZZ]; z++)
+                for (z = 0; z < grids->nc[ZZ]; z++)
                 {
                     pmegrid_init(&grids->grid_th[t],
-                                 x,y,z,
+                                 x, y, z,
                                  (n[XX]*(x  ))/grids->nc[XX],
                                  (n[YY]*(y  ))/grids->nc[YY],
                                  (n[ZZ]*(z  ))/grids->nc[ZZ],
@@ -1726,13 +1781,13 @@ static void pmegrids_init(pmegrids_t *grids,
         }
     }
 
-    snew(grids->g2t,DIM);
+    snew(grids->g2t, DIM);
     tfac = 1;
-    for(d=DIM-1; d>=0; d--)
+    for (d = DIM-1; d >= 0; d--)
     {
-        snew(grids->g2t[d],n[d]);
+        snew(grids->g2t[d], n[d]);
         t = 0;
-        for(i=0; i<n[d]; i++)
+        for (i = 0; i < n[d]; i++)
         {
             /* The second check should match the parameters
              * of the pmegrid_init call above.
@@ -1748,9 +1803,9 @@ static void pmegrids_init(pmegrids_t *grids,
 
         switch (d)
         {
-        case XX: max_comm_lines = overlap_x;     break;
-        case YY: max_comm_lines = overlap_y;     break;
-        case ZZ: max_comm_lines = pme_order - 1; break;
+            case XX: max_comm_lines = overlap_x;     break;
+            case YY: max_comm_lines = overlap_y;     break;
+            case ZZ: max_comm_lines = pme_order - 1; break;
         }
         grids->nthread_comm[d] = 0;
         while ((n[d]*grids->nthread_comm[d])/grids->nc[d] < max_comm_lines &&
@@ -1760,15 +1815,15 @@ static void pmegrids_init(pmegrids_t *grids,
         }
         if (debug != NULL)
         {
-            fprintf(debug,"pmegrid thread grid communication range in %c: %d\n",
-                    'x'+d,grids->nthread_comm[d]);
+            fprintf(debug, "pmegrid thread grid communication range in %c: %d\n",
+                    'x'+d, grids->nthread_comm[d]);
         }
         /* It should be possible to make grids->nthread_comm[d]==grids->nc[d]
          * work, but this is not a problematic restriction.
          */
         if (grids->nc[d] > 1 && grids->nthread_comm[d] > grids->nc[d])
         {
-            gmx_fatal(FARGS,"Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME",grids->nthread);
+            gmx_fatal(FARGS, "Too many threads for PME (%d) compared to the number of grid lines, reduce the number of threads doing PME", grids->nthread);
         }
     }
 }
@@ -1784,7 +1839,7 @@ static void pmegrids_destroy(pmegrids_t *grids)
 
         if (grids->nthread > 0)
         {
-            for(t=0; t<grids->nthread; t++)
+            for (t = 0; t < grids->nthread; t++)
             {
                 sfree(grids->grid_th[t].grid);
             }
@@ -1794,25 +1849,25 @@ static void pmegrids_destroy(pmegrids_t *grids)
 }
 
 
-static void realloc_work(pme_work_t *work,int nkx)
+static void realloc_work(pme_work_t *work, int nkx)
 {
     if (nkx > work->nalloc)
     {
         work->nalloc = nkx;
-        srenew(work->mhx  ,work->nalloc);
-        srenew(work->mhy  ,work->nalloc);
-        srenew(work->mhz  ,work->nalloc);
-        srenew(work->m2   ,work->nalloc);
+        srenew(work->mhx, work->nalloc);
+        srenew(work->mhy, work->nalloc);
+        srenew(work->mhz, work->nalloc);
+        srenew(work->m2, work->nalloc);
         /* Allocate an aligned pointer for SSE operations, including 3 extra
          * elements at the end since SSE operates on 4 elements at a time.
          */
         sfree_aligned(work->denom);
         sfree_aligned(work->tmp1);
         sfree_aligned(work->eterm);
-        snew_aligned(work->denom,work->nalloc+3,16);
-        snew_aligned(work->tmp1 ,work->nalloc+3,16);
-        snew_aligned(work->eterm,work->nalloc+3,16);
-        srenew(work->m2inv,work->nalloc);
+        snew_aligned(work->denom, work->nalloc+3, 16);
+        snew_aligned(work->tmp1, work->nalloc+3, 16);
+        snew_aligned(work->eterm, work->nalloc+3, 16);
+        srenew(work->m2inv, work->nalloc);
     }
 }
 
@@ -1831,26 +1886,26 @@ static void free_work(pme_work_t *work)
 
 
 #ifdef PME_SSE
-    /* Calculate exponentials through SSE in float precision */
+/* Calculate exponentials through SSE in float precision */
 inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 {
     {
-        const __m128 two = _mm_set_ps(2.0f,2.0f,2.0f,2.0f);
+        const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
         __m128 f_sse;
         __m128 lu;
-        __m128 tmp_d1,d_inv,tmp_r,tmp_e;
+        __m128 tmp_d1, d_inv, tmp_r, tmp_e;
         int kx;
         f_sse = _mm_load1_ps(&f);
-        for(kx=0; kx<end; kx+=4)
+        for (kx = 0; kx < end; kx += 4)
         {
             tmp_d1   = _mm_load_ps(d_aligned+kx);
             lu       = _mm_rcp_ps(tmp_d1);
-            d_inv    = _mm_mul_ps(lu,_mm_sub_ps(two,_mm_mul_ps(lu,tmp_d1)));
+            d_inv    = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
             tmp_r    = _mm_load_ps(r_aligned+kx);
             tmp_r    = gmx_mm_exp_ps(tmp_r);
-            tmp_e    = _mm_mul_ps(f_sse,d_inv);
-            tmp_e    = _mm_mul_ps(tmp_e,tmp_r);
-            _mm_store_ps(e_aligned+kx,tmp_e);
+            tmp_e    = _mm_mul_ps(f_sse, d_inv);
+            tmp_e    = _mm_mul_ps(tmp_e, tmp_r);
+            _mm_store_ps(e_aligned+kx, tmp_e);
         }
     }
 }
@@ -1858,15 +1913,15 @@ inline static void calc_exponentials(int start, int end, real f, real *d_aligned
 inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
 {
     int kx;
-    for(kx=start; kx<end; kx++)
+    for (kx = start; kx < end; kx++)
     {
         d[kx] = 1.0/d[kx];
     }
-    for(kx=start; kx<end; kx++)
+    for (kx = start; kx < end; kx++)
     {
         r[kx] = exp(r[kx]);
     }
-    for(kx=start; kx<end; kx++)
+    for (kx = start; kx < end; kx++)
     {
         e[kx] = f*r[kx]*d[kx];
     }
@@ -1874,29 +1929,29 @@ inline static void calc_exponentials(int start, int end, real f, real *d, real *
 #endif
 
 
-static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
-                         real ewaldcoeff,real vol,
+static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
+                         real ewaldcoeff, real vol,
                          gmx_bool bEnerVir,
-                         int nthread,int thread)
+                         int nthread, int thread)
 {
     /* do recip sum over local cells in grid */
     /* y major, z middle, x minor or continuous */
     t_complex *p0;
-    int     kx,ky,kz,maxkx,maxky,maxkz;
-    int     nx,ny,nz,iyz0,iyz1,iyz,iy,iz,kxstart,kxend;
-    real    mx,my,mz;
-    real    factor=M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
-    real    ets2,struct2,vfactor,ets2vf;
-    real    d1,d2,energy=0;
-    real    by,bz;
-    real    virxx=0,virxy=0,virxz=0,viryy=0,viryz=0,virzz=0;
-    real    rxx,ryx,ryy,rzx,rzy,rzz;
+    int     kx, ky, kz, maxkx, maxky, maxkz;
+    int     nx, ny, nz, iyz0, iyz1, iyz, iy, iz, kxstart, kxend;
+    real    mx, my, mz;
+    real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
+    real    ets2, struct2, vfactor, ets2vf;
+    real    d1, d2, energy = 0;
+    real    by, bz;
+    real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
+    real    rxx, ryx, ryy, rzx, rzy, rzz;
     pme_work_t *work;
-    real    *mhx,*mhy,*mhz,*m2,*denom,*tmp1,*eterm,*m2inv;
-    real    mhxk,mhyk,mhzk,m2k;
+    real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *eterm, *m2inv;
+    real    mhxk, mhyk, mhzk, m2k;
     real    corner_fac;
     ivec    complex_order;
-    ivec    local_ndata,local_offset,local_size;
+    ivec    local_ndata, local_offset, local_size;
     real    elfac;
 
     elfac = ONE_4PI_EPS0/pme->epsilon_r;
@@ -1923,7 +1978,7 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
     maxky = (ny+1)/2;
     maxkz = nz/2+1;
 
-    work = &pme->work[thread];
+    work  = &pme->work[thread];
     mhx   = work->mhx;
     mhy   = work->mhy;
     mhz   = work->mhz;
@@ -1936,7 +1991,7 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
     iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
     iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
 
-    for(iyz=iyz0; iyz<iyz1; iyz++)
+    for (iyz = iyz0; iyz < iyz1; iyz++)
     {
         iy = iyz/local_ndata[ZZ];
         iz = iyz - iy*local_ndata[ZZ];
@@ -1989,8 +2044,8 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
              * depend on kx for triclinic unit cells.
              */
 
-                /* Two explicit loops to avoid a conditional inside the loop */
-            for(kx=kxstart; kx<maxkx; kx++)
+            /* Two explicit loops to avoid a conditional inside the loop */
+            for (kx = kxstart; kx < maxkx; kx++)
             {
                 mx = kx;
 
@@ -2006,7 +2061,7 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
                 tmp1[kx]  = -factor*m2k;
             }
 
-            for(kx=maxkx; kx<kxend; kx++)
+            for (kx = maxkx; kx < kxend; kx++)
             {
                 mx = (kx - nx);
 
@@ -2022,14 +2077,14 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
                 tmp1[kx]  = -factor*m2k;
             }
 
-            for(kx=kxstart; kx<kxend; kx++)
+            for (kx = kxstart; kx < kxend; kx++)
             {
                 m2inv[kx] = 1.0/m2[kx];
             }
 
-            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
+            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
 
-            for(kx=kxstart; kx<kxend; kx++,p0++)
+            for (kx = kxstart; kx < kxend; kx++, p0++)
             {
                 d1      = p0->re;
                 d2      = p0->im;
@@ -2042,7 +2097,7 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
                 tmp1[kx] = eterm[kx]*struct2;
             }
 
-            for(kx=kxstart; kx<kxend; kx++)
+            for (kx = kxstart; kx < kxend; kx++)
             {
                 ets2     = corner_fac*tmp1[kx];
                 vfactor  = (factor*m2[kx] + 1.0)*2.0*m2inv[kx];
@@ -2065,7 +2120,7 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
 
             /* Two explicit loops to avoid a conditional inside the loop */
 
-            for(kx=kxstart; kx<maxkx; kx++)
+            for (kx = kxstart; kx < maxkx; kx++)
             {
                 mx = kx;
 
@@ -2077,7 +2132,7 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
                 tmp1[kx]  = -factor*m2k;
             }
 
-            for(kx=maxkx; kx<kxend; kx++)
+            for (kx = maxkx; kx < kxend; kx++)
             {
                 mx = (kx - nx);
 
@@ -2089,9 +2144,9 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
                 tmp1[kx]  = -factor*m2k;
             }
 
-            calc_exponentials(kxstart,kxend,elfac,denom,tmp1,eterm);
+            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
 
-            for(kx=kxstart; kx<kxend; kx++,p0++)
+            for (kx = kxstart; kx < kxend; kx++, p0++)
             {
                 d1      = p0->re;
                 d2      = p0->im;
@@ -2125,8 +2180,8 @@ static int solve_pme_yzx(gmx_pme_t pme,t_complex *grid,
     return local_ndata[YY]*local_ndata[XX];
 }
 
-static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
-                             real *mesh_energy,matrix vir)
+static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
+                             real *mesh_energy, matrix vir)
 {
     /* This function sums output over threads
      * and should therefore only be called after thread synchronization.
@@ -2134,58 +2189,58 @@ static void get_pme_ener_vir(const gmx_pme_t pme,int nthread,
     int thread;
 
     *mesh_energy = pme->work[0].energy;
-    copy_mat(pme->work[0].vir,vir);
+    copy_mat(pme->work[0].vir, vir);
 
-    for(thread=1; thread<nthread; thread++)
+    for (thread = 1; thread < nthread; thread++)
     {
         *mesh_energy += pme->work[thread].energy;
-        m_add(vir,pme->work[thread].vir,vir);
+        m_add(vir, pme->work[thread].vir, vir);
     }
 }
 
 #define DO_FSPLINE(order)                      \
-for(ithx=0; (ithx<order); ithx++)              \
-{                                              \
-    index_x = (i0+ithx)*pny*pnz;               \
-    tx      = thx[ithx];                       \
-    dx      = dthx[ithx];                      \
+    for (ithx = 0; (ithx < order); ithx++)              \
+    {                                              \
+        index_x = (i0+ithx)*pny*pnz;               \
+        tx      = thx[ithx];                       \
+        dx      = dthx[ithx];                      \
                                                \
-    for(ithy=0; (ithy<order); ithy++)          \
-    {                                          \
-        index_xy = index_x+(j0+ithy)*pnz;      \
-        ty       = thy[ithy];                  \
-        dy       = dthy[ithy];                 \
-        fxy1     = fz1 = 0;                    \
+        for (ithy = 0; (ithy < order); ithy++)          \
+        {                                          \
+            index_xy = index_x+(j0+ithy)*pnz;      \
+            ty       = thy[ithy];                  \
+            dy       = dthy[ithy];                 \
+            fxy1     = fz1 = 0;                    \
                                                \
-        for(ithz=0; (ithz<order); ithz++)      \
-        {                                      \
-            gval  = grid[index_xy+(k0+ithz)];  \
-            fxy1 += thz[ithz]*gval;            \
-            fz1  += dthz[ithz]*gval;           \
-        }                                      \
-        fx += dx*ty*fxy1;                      \
-        fy += tx*dy*fxy1;                      \
-        fz += tx*ty*fz1;                       \
-    }                                          \
-}
+            for (ithz = 0; (ithz < order); ithz++)      \
+            {                                      \
+                gval  = grid[index_xy+(k0+ithz)];  \
+                fxy1 += thz[ithz]*gval;            \
+                fz1  += dthz[ithz]*gval;           \
+            }                                      \
+            fx += dx*ty*fxy1;                      \
+            fy += tx*dy*fxy1;                      \
+            fz += tx*ty*fz1;                       \
+        }                                          \
+    }
 
 
-static void gather_f_bsplines(gmx_pme_t pme,real *grid,
-                              gmx_bool bClearF,pme_atomcomm_t *atc,
+static void gather_f_bsplines(gmx_pme_t pme, real *grid,
+                              gmx_bool bClearF, pme_atomcomm_t *atc,
                               splinedata_t *spline,
                               real scale)
 {
     /* sum forces for local particles */
-    int     nn,n,ithx,ithy,ithz,i0,j0,k0;
-    int     index_x,index_xy;
-    int     nx,ny,nz,pnx,pny,pnz;
+    int     nn, n, ithx, ithy, ithz, i0, j0, k0;
+    int     index_x, index_xy;
+    int     nx, ny, nz, pnx, pny, pnz;
     int *   idxptr;
-    real    tx,ty,dx,dy,qn;
-    real    fx,fy,fz,gval;
-    real    fxy1,fz1;
-    real    *thx,*thy,*thz,*dthx,*dthy,*dthz;
+    real    tx, ty, dx, dy, qn;
+    real    fx, fy, fz, gval;
+    real    fxy1, fz1;
+    real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
     int     norder;
-    real    rxx,ryx,ryy,rzx,rzy,rzz;
+    real    rxx, ryx, ryy, rzx, rzy, rzz;
     int     order;
 
     pme_spline_work_t *work;
@@ -2213,7 +2268,7 @@ static void gather_f_bsplines(gmx_pme_t pme,real *grid,
     rzy   = pme->recipbox[ZZ][YY];
     rzz   = pme->recipbox[ZZ][ZZ];
 
-    for(nn=0; nn<spline->n; nn++)
+    for (nn = 0; nn < spline->n; nn++)
     {
         n  = spline->ind[nn];
         qn = scale*atc->q[n];
@@ -2244,8 +2299,9 @@ static void gather_f_bsplines(gmx_pme_t pme,real *grid,
             dthy = spline->dtheta[YY] + norder;
             dthz = spline->dtheta[ZZ] + norder;
 
-            switch (order) {
-            case 4:
+            switch (order)
+            {
+                case 4:
 #ifdef PME_SSE
 #ifdef PME_SSE_UNALIGNED
 #define PME_GATHER_F_SSE_ORDER4
@@ -2255,21 +2311,21 @@ static void gather_f_bsplines(gmx_pme_t pme,real *grid,
 #endif
 #include "pme_sse_single.h"
 #else
-                DO_FSPLINE(4);
+                    DO_FSPLINE(4);
 #endif
-                break;
-            case 5:
+                    break;
+                case 5:
 #ifdef PME_SSE
 #define PME_GATHER_F_SSE_ALIGNED
 #define PME_ORDER 5
 #include "pme_sse_single.h"
 #else
-                DO_FSPLINE(5);
+                    DO_FSPLINE(5);
 #endif
-                break;
-            default:
-                DO_FSPLINE(order);
-                break;
+                    break;
+                default:
+                    DO_FSPLINE(order);
+                    break;
             }
 
             atc->f[n][XX] += -qn*( fx*nx*rxx );
@@ -2289,15 +2345,15 @@ static void gather_f_bsplines(gmx_pme_t pme,real *grid,
 }
 
 
-static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
+static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
                                    pme_atomcomm_t *atc)
 {
     splinedata_t *spline;
-    int     n,ithx,ithy,ithz,i0,j0,k0;
-    int     index_x,index_xy;
+    int     n, ithx, ithy, ithz, i0, j0, k0;
+    int     index_x, index_xy;
     int *   idxptr;
-    real    energy,pot,tx,ty,qn,gval;
-    real    *thx,*thy,*thz;
+    real    energy, pot, tx, ty, qn, gval;
+    real    *thx, *thy, *thz;
     int     norder;
     int     order;
 
@@ -2306,10 +2362,12 @@ static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
     order = pme->pme_order;
 
     energy = 0;
-    for(n=0; (n<atc->n); n++) {
+    for (n = 0; (n < atc->n); n++)
+    {
         qn      = atc->q[n];
 
-        if (qn != 0) {
+        if (qn != 0)
+        {
             idxptr = atc->idx[n];
             norder = n*order;
 
@@ -2323,17 +2381,17 @@ static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
             thz  = spline->theta[ZZ] + norder;
 
             pot = 0;
-            for(ithx=0; (ithx<order); ithx++)
+            for (ithx = 0; (ithx < order); ithx++)
             {
                 index_x = (i0+ithx)*pme->pmegrid_ny*pme->pmegrid_nz;
                 tx      = thx[ithx];
 
-                for(ithy=0; (ithy<order); ithy++)
+                for (ithy = 0; (ithy < order); ithy++)
                 {
                     index_xy = index_x+(j0+ithy)*pme->pmegrid_nz;
                     ty       = thy[ithy];
 
-                    for(ithz=0; (ithz<order); ithz++)
+                    for (ithz = 0; (ithz < order); ithz++)
                     {
                         gval  = grid[index_xy+(k0+ithz)];
                         pot  += tx*ty*thz[ithz]*gval;
@@ -2353,155 +2411,174 @@ static real gather_energy_bsplines(gmx_pme_t pme,real *grid,
  * This gives a significant performance gain.
  */
 #define CALC_SPLINE(order)                     \
-{                                              \
-    int j,k,l;                                 \
-    real dr,div;                               \
-    real data[PME_ORDER_MAX];                  \
-    real ddata[PME_ORDER_MAX];                 \
+    {                                              \
+        int j, k, l;                                 \
+        real dr, div;                               \
+        real data[PME_ORDER_MAX];                  \
+        real ddata[PME_ORDER_MAX];                 \
                                                \
-    for(j=0; (j<DIM); j++)                     \
-    {                                          \
-        dr  = xptr[j];                         \
+        for (j = 0; (j < DIM); j++)                     \
+        {                                          \
+            dr  = xptr[j];                         \
                                                \
-        /* dr is relative offset from lower cell limit */ \
-        data[order-1] = 0;                     \
-        data[1] = dr;                          \
-        data[0] = 1 - dr;                      \
+            /* dr is relative offset from lower cell limit */ \
+            data[order-1] = 0;                     \
+            data[1]       = dr;                          \
+            data[0]       = 1 - dr;                      \
                                                \
-        for(k=3; (k<order); k++)               \
-        {                                      \
-            div = 1.0/(k - 1.0);               \
-            data[k-1] = div*dr*data[k-2];      \
-            for(l=1; (l<(k-1)); l++)           \
-            {                                  \
-                data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
-                                   data[k-l-1]);                \
-            }                                  \
-            data[0] = div*(1-dr)*data[0];      \
-        }                                      \
-        /* differentiate */                    \
-        ddata[0] = -data[0];                   \
-        for(k=1; (k<order); k++)               \
-        {                                      \
-            ddata[k] = data[k-1] - data[k];    \
-        }                                      \
+            for (k = 3; (k < order); k++)               \
+            {                                      \
+                div       = 1.0/(k - 1.0);               \
+                data[k-1] = div*dr*data[k-2];      \
+                for (l = 1; (l < (k-1)); l++)           \
+                {                                  \
+                    data[k-l-1] = div*((dr+l)*data[k-l-2]+(k-l-dr)* \
+                                       data[k-l-1]);                \
+                }                                  \
+                data[0] = div*(1-dr)*data[0];      \
+            }                                      \
+            /* differentiate */                    \
+            ddata[0] = -data[0];                   \
+            for (k = 1; (k < order); k++)               \
+            {                                      \
+                ddata[k] = data[k-1] - data[k];    \
+            }                                      \
                                                \
-        div = 1.0/(order - 1);                 \
-        data[order-1] = div*dr*data[order-2];  \
-        for(l=1; (l<(order-1)); l++)           \
-        {                                      \
-            data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
-                               (order-l-dr)*data[order-l-1]); \
-        }                                      \
-        data[0] = div*(1 - dr)*data[0];        \
+            div           = 1.0/(order - 1);                 \
+            data[order-1] = div*dr*data[order-2];  \
+            for (l = 1; (l < (order-1)); l++)           \
+            {                                      \
+                data[order-l-1] = div*((dr+l)*data[order-l-2]+    \
+                                       (order-l-dr)*data[order-l-1]); \
+            }                                      \
+            data[0] = div*(1 - dr)*data[0];        \
                                                \
-        for(k=0; k<order; k++)                 \
-        {                                      \
-            theta[j][i*order+k]  = data[k];    \
-            dtheta[j][i*order+k] = ddata[k];   \
-        }                                      \
-    }                                          \
-}
+            for (k = 0; k < order; k++)                 \
+            {                                      \
+                theta[j][i*order+k]  = data[k];    \
+                dtheta[j][i*order+k] = ddata[k];   \
+            }                                      \
+        }                                          \
+    }
 
-void make_bsplines(splinevec theta,splinevec dtheta,int order,
-                   rvec fractx[],int nr,int ind[],real charge[],
+void make_bsplines(splinevec theta, splinevec dtheta, int order,
+                   rvec fractx[], int nr, int ind[], real charge[],
                    gmx_bool bFreeEnergy)
 {
     /* construct splines for local atoms */
-    int  i,ii;
+    int  i, ii;
     real *xptr;
 
-    for(i=0; i<nr; i++)
+    for (i = 0; i < nr; i++)
     {
         /* With free energy we do not use the charge check.
          * In most cases this will be more efficient than calling make_bsplines
          * twice, since usually more than half the particles have charges.
          */
         ii = ind[i];
-        if (bFreeEnergy || charge[ii] != 0.0) {
+        if (bFreeEnergy || charge[ii] != 0.0)
+        {
             xptr = fractx[ii];
-            switch(order) {
-            case 4:  CALC_SPLINE(4);     break;
-            case 5:  CALC_SPLINE(5);     break;
-            default: CALC_SPLINE(order); break;
+            switch (order)
+            {
+                case 4:  CALC_SPLINE(4);     break;
+                case 5:  CALC_SPLINE(5);     break;
+                default: CALC_SPLINE(order); break;
             }
         }
     }
 }
 
 
-void make_dft_mod(real *mod,real *data,int ndata)
+void make_dft_mod(real *mod, real *data, int ndata)
 {
-  int i,j;
-  real sc,ss,arg;
-
-  for(i=0;i<ndata;i++) {
-    sc=ss=0;
-    for(j=0;j<ndata;j++) {
-      arg=(2.0*M_PI*i*j)/ndata;
-      sc+=data[j]*cos(arg);
-      ss+=data[j]*sin(arg);
-    }
-    mod[i]=sc*sc+ss*ss;
-  }
-  for(i=0;i<ndata;i++)
-    if(mod[i]<1e-7)
-      mod[i]=(mod[i-1]+mod[i+1])*0.5;
+    int i, j;
+    real sc, ss, arg;
+
+    for (i = 0; i < ndata; i++)
+    {
+        sc = ss = 0;
+        for (j = 0; j < ndata; j++)
+        {
+            arg = (2.0*M_PI*i*j)/ndata;
+            sc += data[j]*cos(arg);
+            ss += data[j]*sin(arg);
+        }
+        mod[i] = sc*sc+ss*ss;
+    }
+    for (i = 0; i < ndata; i++)
+    {
+        if (mod[i] < 1e-7)
+        {
+            mod[i] = (mod[i-1]+mod[i+1])*0.5;
+        }
+    }
 }
 
 
 static void make_bspline_moduli(splinevec bsp_mod,
-                                int nx,int ny,int nz,int order)
+                                int nx, int ny, int nz, int order)
 {
-  int nmax=max(nx,max(ny,nz));
-  real *data,*ddata,*bsp_data;
-  int i,k,l;
-  real div;
-
-  snew(data,order);
-  snew(ddata,order);
-  snew(bsp_data,nmax);
-
-  data[order-1]=0;
-  data[1]=0;
-  data[0]=1;
-
-  for(k=3;k<order;k++) {
-    div=1.0/(k-1.0);
-    data[k-1]=0;
-    for(l=1;l<(k-1);l++)
-      data[k-l-1]=div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
-    data[0]=div*data[0];
-  }
-  /* differentiate */
-  ddata[0]=-data[0];
-  for(k=1;k<order;k++)
-    ddata[k]=data[k-1]-data[k];
-  div=1.0/(order-1);
-  data[order-1]=0;
-  for(l=1;l<(order-1);l++)
-    data[order-l-1]=div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
-  data[0]=div*data[0];
-
-  for(i=0;i<nmax;i++)
-    bsp_data[i]=0;
-  for(i=1;i<=order;i++)
-    bsp_data[i]=data[i-1];
-
-  make_dft_mod(bsp_mod[XX],bsp_data,nx);
-  make_dft_mod(bsp_mod[YY],bsp_data,ny);
-  make_dft_mod(bsp_mod[ZZ],bsp_data,nz);
-
-  sfree(data);
-  sfree(ddata);
-  sfree(bsp_data);
+    int nmax = max(nx, max(ny, nz));
+    real *data, *ddata, *bsp_data;
+    int i, k, l;
+    real div;
+
+    snew(data, order);
+    snew(ddata, order);
+    snew(bsp_data, nmax);
+
+    data[order-1] = 0;
+    data[1]       = 0;
+    data[0]       = 1;
+
+    for (k = 3; k < order; k++)
+    {
+        div       = 1.0/(k-1.0);
+        data[k-1] = 0;
+        for (l = 1; l < (k-1); l++)
+        {
+            data[k-l-1] = div*(l*data[k-l-2]+(k-l)*data[k-l-1]);
+        }
+        data[0] = div*data[0];
+    }
+    /* differentiate */
+    ddata[0] = -data[0];
+    for (k = 1; k < order; k++)
+    {
+        ddata[k] = data[k-1]-data[k];
+    }
+    div           = 1.0/(order-1);
+    data[order-1] = 0;
+    for (l = 1; l < (order-1); l++)
+    {
+        data[order-l-1] = div*(l*data[order-l-2]+(order-l)*data[order-l-1]);
+    }
+    data[0] = div*data[0];
+
+    for (i = 0; i < nmax; i++)
+    {
+        bsp_data[i] = 0;
+    }
+    for (i = 1; i <= order; i++)
+    {
+        bsp_data[i] = data[i-1];
+    }
+
+    make_dft_mod(bsp_mod[XX], bsp_data, nx);
+    make_dft_mod(bsp_mod[YY], bsp_data, ny);
+    make_dft_mod(bsp_mod[ZZ], bsp_data, nz);
+
+    sfree(data);
+    sfree(ddata);
+    sfree(bsp_data);
 }
 
 
 /* Return the P3M optimal influence function */
 static double do_p3m_influence(double z, int order)
 {
-    double z2,z4;
+    double z2, z4;
 
     z2 = z*z;
     z4 = z2*z2;
@@ -2509,106 +2586,109 @@ static double do_p3m_influence(double z, int order)
     /* The formula and most constants can be found in:
      * Ballenegger et al., JCTC 8, 936 (2012)
      */
-    switch(order)
-    {
-    case 2:
-        return 1.0 - 2.0*z2/3.0;
-        break;
-    case 3:
-        return 1.0 - z2 + 2.0*z4/15.0;
-        break;
-    case 4:
-        return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
-        break;
-    case 5:
-        return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
-        break;
-    case 6:
-        return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
-        break;
-    case 7:
-        return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
-    case 8:
-        return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
-        break;
+    switch (order)
+    {
+        case 2:
+            return 1.0 - 2.0*z2/3.0;
+            break;
+        case 3:
+            return 1.0 - z2 + 2.0*z4/15.0;
+            break;
+        case 4:
+            return 1.0 - 4.0*z2/3.0 + 2.0*z4/5.0 + 4.0*z2*z4/315.0;
+            break;
+        case 5:
+            return 1.0 - 5.0*z2/3.0 + 7.0*z4/9.0 - 17.0*z2*z4/189.0 + 2.0*z4*z4/2835.0;
+            break;
+        case 6:
+            return 1.0 - 2.0*z2 + 19.0*z4/15.0 - 256.0*z2*z4/945.0 + 62.0*z4*z4/4725.0 + 4.0*z2*z4*z4/155925.0;
+            break;
+        case 7:
+            return 1.0 - 7.0*z2/3.0 + 28.0*z4/15.0 - 16.0*z2*z4/27.0 + 26.0*z4*z4/405.0 - 2.0*z2*z4*z4/1485.0 + 4.0*z4*z4*z4/6081075.0;
+        case 8:
+            return 1.0 - 8.0*z2/3.0 + 116.0*z4/45.0 - 344.0*z2*z4/315.0 + 914.0*z4*z4/4725.0 - 248.0*z4*z4*z2/22275.0 + 21844.0*z4*z4*z4/212837625.0 - 8.0*z4*z4*z4*z2/638512875.0;
+            break;
     }
 
     return 0.0;
 }
 
 /* Calculate the P3M B-spline moduli for one dimension */
-static void make_p3m_bspline_moduli_dim(real *bsp_mod,int n,int order)
+static void make_p3m_bspline_moduli_dim(real *bsp_mod, int n, int order)
 {
-    double zarg,zai,sinzai,infl;
-    int    maxk,i;
+    double zarg, zai, sinzai, infl;
+    int    maxk, i;
 
     if (order > 8)
     {
-        gmx_fatal(FARGS,"The current P3M code only supports orders up to 8");
+        gmx_fatal(FARGS, "The current P3M code only supports orders up to 8");
     }
 
     zarg = M_PI/n;
 
     maxk = (n + 1)/2;
 
-    for(i=-maxk; i<0; i++)
+    for (i = -maxk; i < 0; i++)
     {
-        zai    = zarg*i;
-        sinzai = sin(zai);
-        infl   = do_p3m_influence(sinzai,order);
-        bsp_mod[n+i] = infl*infl*pow(sinzai/zai,-2.0*order);
+        zai          = zarg*i;
+        sinzai       = sin(zai);
+        infl         = do_p3m_influence(sinzai, order);
+        bsp_mod[n+i] = infl*infl*pow(sinzai/zai, -2.0*order);
     }
     bsp_mod[0] = 1.0;
-    for(i=1; i<maxk; i++)
+    for (i = 1; i < maxk; i++)
     {
-        zai    = zarg*i;
-        sinzai = sin(zai);
-        infl   = do_p3m_influence(sinzai,order);
-        bsp_mod[i] = infl*infl*pow(sinzai/zai,-2.0*order);
+        zai        = zarg*i;
+        sinzai     = sin(zai);
+        infl       = do_p3m_influence(sinzai, order);
+        bsp_mod[i] = infl*infl*pow(sinzai/zai, -2.0*order);
     }
 }
 
 /* Calculate the P3M B-spline moduli */
 static void make_p3m_bspline_moduli(splinevec bsp_mod,
-                                    int nx,int ny,int nz,int order)
+                                    int nx, int ny, int nz, int order)
 {
-    make_p3m_bspline_moduli_dim(bsp_mod[XX],nx,order);
-    make_p3m_bspline_moduli_dim(bsp_mod[YY],ny,order);
-    make_p3m_bspline_moduli_dim(bsp_mod[ZZ],nz,order);
+    make_p3m_bspline_moduli_dim(bsp_mod[XX], nx, order);
+    make_p3m_bspline_moduli_dim(bsp_mod[YY], ny, order);
+    make_p3m_bspline_moduli_dim(bsp_mod[ZZ], nz, order);
 }
 
 
 static void setup_coordinate_communication(pme_atomcomm_t *atc)
 {
-  int nslab,n,i;
-  int fw,bw;
-
-  nslab = atc->nslab;
-
-  n = 0;
-  for(i=1; i<=nslab/2; i++) {
-    fw = (atc->nodeid + i) % nslab;
-    bw = (atc->nodeid - i + nslab) % nslab;
-    if (n < nslab - 1) {
-      atc->node_dest[n] = fw;
-      atc->node_src[n]  = bw;
-      n++;
-    }
-    if (n < nslab - 1) {
-      atc->node_dest[n] = bw;
-      atc->node_src[n]  = fw;
-      n++;
-    }
-  }
+    int nslab, n, i;
+    int fw, bw;
+
+    nslab = atc->nslab;
+
+    n = 0;
+    for (i = 1; i <= nslab/2; i++)
+    {
+        fw = (atc->nodeid + i) % nslab;
+        bw = (atc->nodeid - i + nslab) % nslab;
+        if (n < nslab - 1)
+        {
+            atc->node_dest[n] = fw;
+            atc->node_src[n]  = bw;
+            n++;
+        }
+        if (n < nslab - 1)
+        {
+            atc->node_dest[n] = bw;
+            atc->node_src[n]  = fw;
+            n++;
+        }
+    }
 }
 
-int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
+int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
 {
     int thread;
 
-    if(NULL != log)
+    if (NULL != log)
     {
-        fprintf(log,"Destroying PME data structures.\n");
+        fprintf(log, "Destroying PME data structures.\n");
     }
 
     sfree((*pmedata)->nnx);
@@ -2628,7 +2708,7 @@ int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
         sfree((*pmedata)->cfftgridB);
         gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
     }
-    for(thread=0; thread<(*pmedata)->nthread; thread++)
+    for (thread = 0; thread < (*pmedata)->nthread; thread++)
     {
         free_work(&(*pmedata)->work[thread]);
     }
@@ -2637,10 +2717,10 @@ int gmx_pme_destroy(FILE *log,gmx_pme_t *pmedata)
     sfree(*pmedata);
     *pmedata = NULL;
 
-  return 0;
+    return 0;
 }
 
-static int mult_up(int n,int f)
+static int mult_up(int n, int f)
 {
     return ((n + f - 1)/f)*f;
 }
@@ -2648,40 +2728,40 @@ static int mult_up(int n,int f)
 
 static double pme_load_imbalance(gmx_pme_t pme)
 {
-    int    nma,nmi;
-    double n1,n2,n3;
+    int    nma, nmi;
+    double n1, n2, n3;
 
     nma = pme->nnodes_major;
     nmi = pme->nnodes_minor;
 
-    n1 = mult_up(pme->nkx,nma)*mult_up(pme->nky,nmi)*pme->nkz;
-    n2 = mult_up(pme->nkx,nma)*mult_up(pme->nkz,nmi)*pme->nky;
-    n3 = mult_up(pme->nky,nma)*mult_up(pme->nkz,nmi)*pme->nkx;
+    n1 = mult_up(pme->nkx, nma)*mult_up(pme->nky, nmi)*pme->nkz;
+    n2 = mult_up(pme->nkx, nma)*mult_up(pme->nkz, nmi)*pme->nky;
+    n3 = mult_up(pme->nky, nma)*mult_up(pme->nkz, nmi)*pme->nkx;
 
     /* pme_solve is roughly double the cost of an fft */
 
     return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
 }
 
-static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
-                          int dimind,gmx_bool bSpread)
+static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
+                          int dimind, gmx_bool bSpread)
 {
-    int nk,k,s,thread;
+    int nk, k, s, thread;
 
-    atc->dimind = dimind;
-    atc->nslab  = 1;
-    atc->nodeid = 0;
+    atc->dimind    = dimind;
+    atc->nslab     = 1;
+    atc->nodeid    = 0;
     atc->pd_nalloc = 0;
 #ifdef GMX_MPI
     if (pme->nnodes > 1)
     {
         atc->mpi_comm = pme->mpi_comm_d[dimind];
-        MPI_Comm_size(atc->mpi_comm,&atc->nslab);
-        MPI_Comm_rank(atc->mpi_comm,&atc->nodeid);
+        MPI_Comm_size(atc->mpi_comm, &atc->nslab);
+        MPI_Comm_rank(atc->mpi_comm, &atc->nodeid);
     }
     if (debug)
     {
-        fprintf(debug,"For PME atom communication in dimind %d: nslab %d rank %d\n",atc->dimind,atc->nslab,atc->nodeid);
+        fprintf(debug, "For PME atom communication in dimind %d: nslab %d rank %d\n", atc->dimind, atc->nslab, atc->nodeid);
     }
 #endif
 
@@ -2691,34 +2771,34 @@ static void init_atomcomm(gmx_pme_t pme,pme_atomcomm_t *atc, t_commrec *cr,
     if (atc->nslab > 1)
     {
         /* These three allocations are not required for particle decomp. */
-        snew(atc->node_dest,atc->nslab);
-        snew(atc->node_src,atc->nslab);
+        snew(atc->node_dest, atc->nslab);
+        snew(atc->node_src, atc->nslab);
         setup_coordinate_communication(atc);
 
-        snew(atc->count_thread,pme->nthread);
-        for(thread=0; thread<pme->nthread; thread++)
+        snew(atc->count_thread, pme->nthread);
+        for (thread = 0; thread < pme->nthread; thread++)
         {
-            snew(atc->count_thread[thread],atc->nslab);
+            snew(atc->count_thread[thread], atc->nslab);
         }
         atc->count = atc->count_thread[0];
-        snew(atc->rcount,atc->nslab);
-        snew(atc->buf_index,atc->nslab);
+        snew(atc->rcount, atc->nslab);
+        snew(atc->buf_index, atc->nslab);
     }
 
     atc->nthread = pme->nthread;
     if (atc->nthread > 1)
     {
-        snew(atc->thread_plist,atc->nthread);
+        snew(atc->thread_plist, atc->nthread);
     }
-    snew(atc->spline,atc->nthread);
-    for(thread=0; thread<atc->nthread; thread++)
+    snew(atc->spline, atc->nthread);
+    for (thread = 0; thread < atc->nthread; thread++)
     {
         if (atc->nthread > 1)
         {
-            snew(atc->thread_plist[thread].n,atc->nthread+2*GMX_CACHE_SEP);
+            snew(atc->thread_plist[thread].n, atc->nthread+2*GMX_CACHE_SEP);
             atc->thread_plist[thread].n += GMX_CACHE_SEP;
         }
-        snew(atc->spline[thread].thread_one,pme->nthread);
+        snew(atc->spline[thread].thread_one, pme->nthread);
         atc->spline[thread].thread_one[thread] = 1;
     }
 }
@@ -2734,12 +2814,12 @@ init_overlap_comm(pme_overlap_t *  ol,
                   int              ndata,
                   int              commplainsize)
 {
-    int lbnd,rbnd,maxlr,b,i;
+    int lbnd, rbnd, maxlr, b, i;
     int exten;
-    int nn,nk;
+    int nn, nk;
     pme_grid_comm_t *pgc;
     gmx_bool bCont;
-    int fft_start,fft_end,send_index1,recv_index1;
+    int fft_start, fft_end, send_index1, recv_index1;
 #ifdef GMX_MPI
     MPI_Status stat;
 
@@ -2756,10 +2836,13 @@ init_overlap_comm(pme_overlap_t *  ol,
      * that belong to higher nodes (modulo nnodes)
      */
 
-    snew(ol->s2g0,ol->nnodes+1);
-    snew(ol->s2g1,ol->nnodes);
-    if (debug) { fprintf(debug,"PME slab boundaries:"); }
-    for(i=0; i<nnodes; i++)
+    snew(ol->s2g0, ol->nnodes+1);
+    snew(ol->s2g1, ol->nnodes);
+    if (debug)
+    {
+        fprintf(debug, "PME slab boundaries:");
+    }
+    for (i = 0; i < nnodes; i++)
     {
         /* s2g0 the local interpolation grid start.
          * s2g1 the local interpolation grid end.
@@ -2771,11 +2854,14 @@ init_overlap_comm(pme_overlap_t *  ol,
 
         if (debug)
         {
-            fprintf(debug,"  %3d %3d",ol->s2g0[i],ol->s2g1[i]);
+            fprintf(debug, "  %3d %3d", ol->s2g0[i], ol->s2g1[i]);
         }
     }
     ol->s2g0[nnodes] = ndata;
-    if (debug) { fprintf(debug,"\n"); }
+    if (debug)
+    {
+        fprintf(debug, "\n");
+    }
 
     /* Determine with how many nodes we need to communicate the grid overlap */
     b = 0;
@@ -2783,7 +2869,7 @@ init_overlap_comm(pme_overlap_t *  ol,
     {
         b++;
         bCont = FALSE;
-        for(i=0; i<nnodes; i++)
+        for (i = 0; i < nnodes; i++)
         {
             if ((i+b <  nnodes && ol->s2g1[i] > ol->s2g0[i+b]) ||
                 (i+b >= nnodes && ol->s2g1[i] > ol->s2g0[i+b-nnodes] + ndata))
@@ -2795,9 +2881,9 @@ init_overlap_comm(pme_overlap_t *  ol,
     while (bCont && b < nnodes);
     ol->noverlap_nodes = b - 1;
 
-    snew(ol->send_id,ol->noverlap_nodes);
-    snew(ol->recv_id,ol->noverlap_nodes);
-    for(b=0; b<ol->noverlap_nodes; b++)
+    snew(ol->send_id, ol->noverlap_nodes);
+    snew(ol->recv_id, ol->noverlap_nodes);
+    for (b = 0; b < ol->noverlap_nodes; b++)
     {
         ol->send_id[b] = (ol->nodeid + (b + 1)) % ol->nnodes;
         ol->recv_id[b] = (ol->nodeid - (b + 1) + ol->nnodes) % ol->nnodes;
@@ -2805,7 +2891,7 @@ init_overlap_comm(pme_overlap_t *  ol,
     snew(ol->comm_data, ol->noverlap_nodes);
 
     ol->send_size = 0;
-    for(b=0; b<ol->noverlap_nodes; b++)
+    for (b = 0; b < ol->noverlap_nodes; b++)
     {
         pgc = &ol->comm_data[b];
         /* Send */
@@ -2816,10 +2902,10 @@ init_overlap_comm(pme_overlap_t *  ol,
             fft_start += ndata;
             fft_end   += ndata;
         }
-        send_index1      = ol->s2g1[nodeid];
-        send_index1      = min(send_index1,fft_end);
-        pgc->send_index0 = fft_start;
-        pgc->send_nindex = max(0,send_index1 - pgc->send_index0);
+        send_index1       = ol->s2g1[nodeid];
+        send_index1       = min(send_index1, fft_end);
+        pgc->send_index0  = fft_start;
+        pgc->send_nindex  = max(0, send_index1 - pgc->send_index0);
         ol->send_size    += pgc->send_nindex;
 
         /* We always start receiving to the first index of our slab */
@@ -2830,28 +2916,28 @@ init_overlap_comm(pme_overlap_t *  ol,
         {
             recv_index1 -= ndata;
         }
-        recv_index1      = min(recv_index1,fft_end);
+        recv_index1      = min(recv_index1, fft_end);
         pgc->recv_index0 = fft_start;
-        pgc->recv_nindex = max(0,recv_index1 - pgc->recv_index0);
+        pgc->recv_nindex = max(0, recv_index1 - pgc->recv_index0);
     }
 
 #ifdef GMX_MPI
     /* Communicate the buffer sizes to receive */
-    for(b=0; b<ol->noverlap_nodes; b++)
+    for (b = 0; b < ol->noverlap_nodes; b++)
     {
-        MPI_Sendrecv(&ol->send_size             ,1,MPI_INT,ol->send_id[b],b,
-                     &ol->comm_data[b].recv_size,1,MPI_INT,ol->recv_id[b],b,
-                     ol->mpi_comm,&stat);
+        MPI_Sendrecv(&ol->send_size, 1, MPI_INT, ol->send_id[b], b,
+                     &ol->comm_data[b].recv_size, 1, MPI_INT, ol->recv_id[b], b,
+                     ol->mpi_comm, &stat);
     }
 #endif
 
     /* For non-divisible grid we need pme_order iso pme_order-1 */
-    snew(ol->sendbuf,norder*commplainsize);
-    snew(ol->recvbuf,norder*commplainsize);
+    snew(ol->sendbuf, norder*commplainsize);
+    snew(ol->recvbuf, norder*commplainsize);
 }
 
 static void
-make_gridindex5_to_localindex(int n,int local_start,int local_range,
+make_gridindex5_to_localindex(int n, int local_start, int local_range,
                               int **global_to_local,
                               real **fraction_shift)
 {
@@ -2859,9 +2945,9 @@ make_gridindex5_to_localindex(int n,int local_start,int local_range,
     int * gtl;
     real * fsh;
 
-    snew(gtl,5*n);
-    snew(fsh,5*n);
-    for(i=0; (i<5*n); i++)
+    snew(gtl, 5*n);
+    snew(fsh, 5*n);
+    for (i = 0; (i < 5*n); i++)
     {
         /* Determine the global to local grid index */
         gtl[i] = (i - local_start + n) % n;
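
A minimal sketch of the wrap-around index mapping built in this loop: a global grid index is shifted by the local slab start and wrapped modulo the grid size. The real table spans 5*n entries, presumably so that shifted indices can be looked up without additional wrapping; the sketch below shows the mapping only for the first n entries, with a hypothetical grid size and slab start.

    #include <stdio.h>

    /* Illustrative only: the global-to-local mapping gtl[i] computed above. */
    int main(void)
    {
        int n = 8, local_start = 3;   /* hypothetical grid size and slab start */
        int i;

        for (i = 0; i < n; i++)
        {
            int local = (i - local_start + n) % n;   /* same expression as gtl[i] */
            printf("global %d -> local %d\n", i, local);
        }
        return 0;
    }
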
@@ -2909,9 +2995,9 @@ static pme_spline_work_t *make_pme_spline_work(int order)
 #ifdef PME_SSE
     float  tmp[8];
     __m128 zero_SSE;
-    int    of,i;
+    int    of, i;
 
-    snew_aligned(work,1,16);
+    snew_aligned(work, 1, 16);
 
     zero_SSE = _mm_setzero_ps();
 
@@ -2919,16 +3005,16 @@ static pme_spline_work_t *make_pme_spline_work(int order)
      * as we only operate on order of the 8 grid entries that are
      * loaded into 2 SSE float registers.
      */
-    for(of=0; of<8-(order-1); of++)
+    for (of = 0; of < 8-(order-1); of++)
     {
-        for(i=0; i<8; i++)
+        for (i = 0; i < 8; i++)
         {
             tmp[i] = (i >= of && i < of+order ? 1 : 0);
         }
         work->mask_SSE0[of] = _mm_loadu_ps(tmp);
         work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
-        work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of],zero_SSE);
-        work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of],zero_SSE);
+        work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
+        work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
     }
 #else
     work = NULL;
@@ -2938,7 +3024,7 @@ static pme_spline_work_t *make_pme_spline_work(int order)
 }
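
A minimal sketch of the 0/1 pattern that the loop above converts into SSE comparison masks: for each offset of, exactly order consecutive entries out of 8 are selected. The order value below is hypothetical; the selection expression is the one used above.

    #include <stdio.h>

    /* Illustrative only: the 0/1 pattern that make_pme_spline_work() turns
     * into SSE masks; each row selects order consecutive entries out of 8.
     */
    int main(void)
    {
        const int order = 4;   /* hypothetical interpolation order */
        int of, i;

        for (of = 0; of < 8 - (order - 1); of++)
        {
            printf("of=%d:", of);
            for (i = 0; i < 8; i++)
            {
                printf(" %d", (i >= of && i < of + order) ? 1 : 0);
            }
            printf("\n");
        }
        return 0;
    }
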
 
 static void
-gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
+gmx_pme_check_grid_restrictions(FILE *fplog, char dim, int nnodes, int *nk)
 {
     int nk_new;
 
@@ -2948,14 +3034,14 @@ gmx_pme_check_grid_restrictions(FILE *fplog,char dim,int nnodes,int *nk)
 
         if (2*nk_new >= 3*(*nk))
         {
-            gmx_fatal(FARGS,"The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
-                      dim,*nk,dim,nnodes,dim);
+            gmx_fatal(FARGS, "The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
+                      dim, *nk, dim, nnodes, dim);
         }
 
         if (fplog != NULL)
         {
-            fprintf(fplog,"\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
-                    dim,*nk,dim,nnodes,dim,nk_new,dim);
+            fprintf(fplog, "\nNOTE: The PME grid size in dim %c (%d) is not divisible by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
+                    dim, *nk, dim, nnodes, dim, nk_new, dim);
         }
 
         *nk = nk_new;
@@ -2972,14 +3058,16 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                  gmx_bool            bReproducible,
                  int                 nthread)
 {
-    gmx_pme_t pme=NULL;
+    gmx_pme_t pme = NULL;
 
     pme_atomcomm_t *atc;
     ivec ndata;
 
     if (debug)
-        fprintf(debug,"Creating PME data structures.\n");
-    snew(pme,1);
+    {
+        fprintf(debug, "Creating PME data structures.\n");
+    }
+    snew(pme, 1);
 
     pme->redist_init         = FALSE;
     pme->sum_qgrid_tmp       = NULL;
@@ -2998,8 +3086,8 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
     {
         pme->mpi_comm = cr->mpi_comm_mygroup;
 
-        MPI_Comm_rank(pme->mpi_comm,&pme->nodeid);
-        MPI_Comm_size(pme->mpi_comm,&pme->nnodes);
+        MPI_Comm_rank(pme->mpi_comm, &pme->nodeid);
+        MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
         if (pme->nnodes != nnodes_major*nnodes_minor)
         {
             gmx_incons("PME node count mismatch");
@@ -3017,7 +3105,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         pme->mpi_comm_d[0] = MPI_COMM_NULL;
         pme->mpi_comm_d[1] = MPI_COMM_NULL;
 #endif
-        pme->ndecompdim = 0;
+        pme->ndecompdim   = 0;
         pme->nodeid_major = 0;
         pme->nodeid_minor = 0;
 #ifdef GMX_MPI
@@ -3032,7 +3120,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
             pme->mpi_comm_d[0] = pme->mpi_comm;
             pme->mpi_comm_d[1] = MPI_COMM_NULL;
 #endif
-            pme->ndecompdim = 1;
+            pme->ndecompdim   = 1;
             pme->nodeid_major = pme->nodeid;
             pme->nodeid_minor = 0;
 
@@ -3043,7 +3131,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
             pme->mpi_comm_d[0] = MPI_COMM_NULL;
             pme->mpi_comm_d[1] = pme->mpi_comm;
 #endif
-            pme->ndecompdim = 1;
+            pme->ndecompdim   = 1;
             pme->nodeid_major = 0;
             pme->nodeid_minor = pme->nodeid;
         }
@@ -3056,15 +3144,15 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
             pme->ndecompdim = 2;
 
 #ifdef GMX_MPI
-            MPI_Comm_split(pme->mpi_comm,pme->nodeid % nnodes_minor,
-                           pme->nodeid,&pme->mpi_comm_d[0]);  /* My communicator along major dimension */
-            MPI_Comm_split(pme->mpi_comm,pme->nodeid/nnodes_minor,
-                           pme->nodeid,&pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
-
-            MPI_Comm_rank(pme->mpi_comm_d[0],&pme->nodeid_major);
-            MPI_Comm_size(pme->mpi_comm_d[0],&pme->nnodes_major);
-            MPI_Comm_rank(pme->mpi_comm_d[1],&pme->nodeid_minor);
-            MPI_Comm_size(pme->mpi_comm_d[1],&pme->nnodes_minor);
+            MPI_Comm_split(pme->mpi_comm, pme->nodeid % nnodes_minor,
+                           pme->nodeid, &pme->mpi_comm_d[0]);  /* My communicator along major dimension */
+            MPI_Comm_split(pme->mpi_comm, pme->nodeid/nnodes_minor,
+                           pme->nodeid, &pme->mpi_comm_d[1]);  /* My communicator along minor dimension */
+
+            MPI_Comm_rank(pme->mpi_comm_d[0], &pme->nodeid_major);
+            MPI_Comm_size(pme->mpi_comm_d[0], &pme->nnodes_major);
+            MPI_Comm_rank(pme->mpi_comm_d[1], &pme->nodeid_minor);
+            MPI_Comm_size(pme->mpi_comm_d[1], &pme->nnodes_minor);
 #endif
         }
         pme->bPPnode = (cr->duty & DUTY_PP);
@@ -3074,7 +3162,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
 
     if (ir->ePBC == epbcSCREW)
     {
-        gmx_fatal(FARGS,"pme does not (yet) work with pbc = screw");
+        gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
     }
 
     pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
@@ -3087,8 +3175,8 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
 
     if (pme->pme_order > PME_ORDER_MAX)
     {
-        gmx_fatal(FARGS,"pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
-                  pme->pme_order,PME_ORDER_MAX);
+        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
+                  pme->pme_order, PME_ORDER_MAX);
     }
 
     /* Currently pme.c supports only the fft5d FFT code.
@@ -3098,26 +3186,27 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
      * This check should be done before calling gmx_pme_init
      * and fplog should be passed instead of stderr.
      *
-    if (pme->ndecompdim >= 2)
-    */
+       if (pme->ndecompdim >= 2)
+     */
     if (pme->ndecompdim >= 1)
     {
         /*
-        gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
+           gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
                                         'x',nnodes_major,&pme->nkx);
-        gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
+           gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
                                         'y',nnodes_minor,&pme->nky);
-        */
+         */
     }
 
     if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
         pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
         pme->nkz <= pme->pme_order)
     {
-        gmx_fatal(FARGS,"The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",pme->pme_order);
+        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and, for dimensions with domain decomposition, larger than 2*pme_order", pme->pme_order);
     }
 
-    if (pme->nnodes > 1) {
+    if (pme->nnodes > 1)
+    {
         double imbal;
 
 #ifdef GMX_MPI
@@ -3142,8 +3231,8 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                     "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
                     "\n",
                     (int)((imbal-1)*100 + 0.5),
-                    pme->nkx,pme->nky,pme->nnodes_major,
-                    pme->nky,pme->nkz,pme->nnodes_minor);
+                    pme->nkx, pme->nky, pme->nnodes_major,
+                    pme->nky, pme->nkz, pme->nnodes_minor);
         }
     }
 
@@ -3152,25 +3241,25 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
      * y is always copied through a buffer: we don't need padding in z,
      * but we do need the overlap in x because of the communication order.
      */
-    init_overlap_comm(&pme->overlap[0],pme->pme_order,
+    init_overlap_comm(&pme->overlap[0], pme->pme_order,
 #ifdef GMX_MPI
                       pme->mpi_comm_d[0],
 #endif
-                      pme->nnodes_major,pme->nodeid_major,
+                      pme->nnodes_major, pme->nodeid_major,
                       pme->nkx,
-                      (div_round_up(pme->nky,pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
+                      (div_round_up(pme->nky, pme->nnodes_minor)+pme->pme_order)*(pme->nkz+pme->pme_order-1));
 
     /* Along overlap dim 1 we can send in multiple pulses in sum_fftgrid_dd.
      * We do this with an offset buffer of equal size, so we need to allocate
      * extra for the offset. That's what the (+1)*pme->nkz is for.
      */
-    init_overlap_comm(&pme->overlap[1],pme->pme_order,
+    init_overlap_comm(&pme->overlap[1], pme->pme_order,
 #ifdef GMX_MPI
                       pme->mpi_comm_d[1],
 #endif
-                      pme->nnodes_minor,pme->nodeid_minor,
+                      pme->nnodes_minor, pme->nodeid_minor,
                       pme->nky,
-                      (div_round_up(pme->nkx,pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
+                      (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
 
     /* Check for a limitation of the (current) sum_fftgrid_dd code.
      * We only allow multiple communication pulses in dim 1, not in dim 0.
@@ -3178,24 +3267,24 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
     if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
                              pme->nkx < pme->nnodes_major*pme->pme_order))
     {
-        gmx_fatal(FARGS,"The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
-                  pme->nkx/(double)pme->nnodes_major,pme->pme_order);
+        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d). To resolve this issue, use fewer nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
+                  pme->nkx/(double)pme->nnodes_major, pme->pme_order);
     }
 
-    snew(pme->bsp_mod[XX],pme->nkx);
-    snew(pme->bsp_mod[YY],pme->nky);
-    snew(pme->bsp_mod[ZZ],pme->nkz);
+    snew(pme->bsp_mod[XX], pme->nkx);
+    snew(pme->bsp_mod[YY], pme->nky);
+    snew(pme->bsp_mod[ZZ], pme->nkz);
 
     /* The required size of the interpolation grid, including overlap.
      * The allocated size (pmegrid_n?) might be slightly larger.
      */
     pme->pmegrid_nx = pme->overlap[0].s2g1[pme->nodeid_major] -
-                      pme->overlap[0].s2g0[pme->nodeid_major];
+        pme->overlap[0].s2g0[pme->nodeid_major];
     pme->pmegrid_ny = pme->overlap[1].s2g1[pme->nodeid_minor] -
-                      pme->overlap[1].s2g0[pme->nodeid_minor];
+        pme->overlap[1].s2g0[pme->nodeid_minor];
     pme->pmegrid_nz_base = pme->nkz;
-    pme->pmegrid_nz = pme->pmegrid_nz_base + pme->pme_order - 1;
-    set_grid_alignment(&pme->pmegrid_nz,pme->pme_order);
+    pme->pmegrid_nz      = pme->pmegrid_nz_base + pme->pme_order - 1;
+    set_grid_alignment(&pme->pmegrid_nz, pme->pme_order);
 
     pme->pmegrid_start_ix = pme->overlap[0].s2g0[pme->nodeid_major];
     pme->pmegrid_start_iy = pme->overlap[1].s2g0[pme->nodeid_minor];
@@ -3204,18 +3293,18 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
     make_gridindex5_to_localindex(pme->nkx,
                                   pme->pmegrid_start_ix,
                                   pme->pmegrid_nx - (pme->pme_order-1),
-                                  &pme->nnx,&pme->fshx);
+                                  &pme->nnx, &pme->fshx);
     make_gridindex5_to_localindex(pme->nky,
                                   pme->pmegrid_start_iy,
                                   pme->pmegrid_ny - (pme->pme_order-1),
-                                  &pme->nny,&pme->fshy);
+                                  &pme->nny, &pme->fshy);
     make_gridindex5_to_localindex(pme->nkz,
                                   pme->pmegrid_start_iz,
                                   pme->pmegrid_nz_base,
-                                  &pme->nnz,&pme->fshz);
+                                  &pme->nnz, &pme->fshz);
 
     pmegrids_init(&pme->pmegridA,
-                  pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
+                  pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
                   pme->pmegrid_nz_base,
                   pme->pme_order,
                   pme->nthread,
@@ -3229,27 +3318,27 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
     ndata[2] = pme->nkz;
 
     /* This routine will allocate the grid data to fit the FFTs */
-    gmx_parallel_3dfft_init(&pme->pfft_setupA,ndata,
-                            &pme->fftgridA,&pme->cfftgridA,
+    gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
+                            &pme->fftgridA, &pme->cfftgridA,
                             pme->mpi_comm_d,
-                            pme->overlap[0].s2g0,pme->overlap[1].s2g0,
-                            bReproducible,pme->nthread);
+                            pme->overlap[0].s2g0, pme->overlap[1].s2g0,
+                            bReproducible, pme->nthread);
 
     if (bFreeEnergy)
     {
         pmegrids_init(&pme->pmegridB,
-                      pme->pmegrid_nx,pme->pmegrid_ny,pme->pmegrid_nz,
+                      pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
                       pme->pmegrid_nz_base,
                       pme->pme_order,
                       pme->nthread,
                       pme->nkx % pme->nnodes_major != 0,
                       pme->nky % pme->nnodes_minor != 0);
 
-        gmx_parallel_3dfft_init(&pme->pfft_setupB,ndata,
-                                &pme->fftgridB,&pme->cfftgridB,
+        gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
+                                &pme->fftgridB, &pme->cfftgridB,
                                 pme->mpi_comm_d,
-                                pme->overlap[0].s2g0,pme->overlap[1].s2g0,
-                                bReproducible,pme->nthread);
+                                pme->overlap[0].s2g0, pme->overlap[1].s2g0,
+                                bReproducible, pme->nthread);
     }
     else
     {
@@ -3261,22 +3350,23 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
     if (!pme->bP3M)
     {
         /* Use plain SPME B-spline interpolation */
-        make_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
+        make_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
     }
     else
     {
         /* Use the P3M grid-optimized influence function */
-        make_p3m_bspline_moduli(pme->bsp_mod,pme->nkx,pme->nky,pme->nkz,pme->pme_order);
+        make_p3m_bspline_moduli(pme->bsp_mod, pme->nkx, pme->nky, pme->nkz, pme->pme_order);
     }
 
     /* Use atc[0] for spreading */
-    init_atomcomm(pme,&pme->atc[0],cr,nnodes_major > 1 ? 0 : 1,TRUE);
+    init_atomcomm(pme, &pme->atc[0], cr, nnodes_major > 1 ? 0 : 1, TRUE);
     if (pme->ndecompdim >= 2)
     {
-        init_atomcomm(pme,&pme->atc[1],cr,1,FALSE);
+        init_atomcomm(pme, &pme->atc[1], cr, 1, FALSE);
     }
 
-    if (pme->nnodes == 1) {
+    if (pme->nnodes == 1)
+    {
         pme->atc[0].n = homenr;
         pme_realloc_atomcomm_things(&pme->atc[0]);
     }
@@ -3286,23 +3376,23 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
 
         /* Use fft5d, order after FFT is y major, z, x minor */
 
-        snew(pme->work,pme->nthread);
-        for(thread=0; thread<pme->nthread; thread++)
+        snew(pme->work, pme->nthread);
+        for (thread = 0; thread < pme->nthread; thread++)
         {
-            realloc_work(&pme->work[thread],pme->nkx);
+            realloc_work(&pme->work[thread], pme->nkx);
         }
     }
 
     *pmedata = pme;
-    
+
     return 0;
 }
 
-static void reuse_pmegrids(const pmegrids_t *old,pmegrids_t *new)
+static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
 {
-    int d,t;
+    int d, t;
 
-    for(d=0; d<DIM; d++)
+    for (d = 0; d < DIM; d++)
     {
         if (new->grid.n[d] > old->grid.n[d])
         {
@@ -3316,7 +3406,7 @@ static void reuse_pmegrids(const pmegrids_t *old,pmegrids_t *new)
     if (new->nthread > 1 && new->nthread == old->nthread)
     {
         sfree_aligned(new->grid_all);
-        for(t=0; t<new->nthread; t++)
+        for (t = 0; t < new->nthread; t++)
         {
             new->grid_th[t].grid = old->grid_th[t].grid;
         }
@@ -3333,7 +3423,7 @@ int gmx_pme_reinit(gmx_pme_t *         pmedata,
     int homenr;
     int ret;
 
-    irc = *ir;
+    irc     = *ir;
     irc.nkx = grid_size[XX];
     irc.nky = grid_size[YY];
     irc.nkz = grid_size[ZZ];
@@ -3347,13 +3437,13 @@ int gmx_pme_reinit(gmx_pme_t *         pmedata,
         homenr = -1;
     }
 
-    ret = gmx_pme_init(pmedata,cr,pme_src->nnodes_major,pme_src->nnodes_minor,
-                       &irc,homenr,pme_src->bFEP,FALSE,pme_src->nthread);
+    ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
+                       &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
 
     if (ret == 0)
     {
         /* We can easily reuse the allocated pme grids in pme_src */
-        reuse_pmegrids(&pme_src->pmegridA,&(*pmedata)->pmegridA);
+        reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
         /* We would like to reuse the fft grids, but that's harder */
     }
 
@@ -3362,13 +3452,13 @@ int gmx_pme_reinit(gmx_pme_t *         pmedata,
 
 
 static void copy_local_grid(gmx_pme_t pme,
-                            pmegrids_t *pmegrids,int thread,real *fftgrid)
+                            pmegrids_t *pmegrids, int thread, real *fftgrid)
 {
-    ivec local_fft_ndata,local_fft_offset,local_fft_size;
-    int  fft_my,fft_mz;
-    int  nsx,nsy,nsz;
+    ivec local_fft_ndata, local_fft_offset, local_fft_size;
+    int  fft_my, fft_mz;
+    int  nsx, nsy, nsz;
     ivec nf;
-    int  offx,offy,offz,x,y,z,i0,i0t;
+    int  offx, offy, offz, x, y, z, i0, i0t;
     int  d;
     pmegrid_t *pmegrid;
     real *grid_th;
@@ -3386,7 +3476,7 @@ static void copy_local_grid(gmx_pme_t pme,
     nsy = pmegrid->s[YY];
     nsz = pmegrid->s[ZZ];
 
-    for(d=0; d<DIM; d++)
+    for (d = 0; d < DIM; d++)
     {
         nf[d] = min(pmegrid->n[d] - (pmegrid->order - 1),
                     local_fft_ndata[d] - pmegrid->offset[d]);
@@ -3400,13 +3490,13 @@ static void copy_local_grid(gmx_pme_t pme,
      * This also initializes the full grid.
      */
     grid_th = pmegrid->grid;
-    for(x=0; x<nf[XX]; x++)
+    for (x = 0; x < nf[XX]; x++)
     {
-        for(y=0; y<nf[YY]; y++)
+        for (y = 0; y < nf[YY]; y++)
         {
             i0  = ((offx + x)*fft_my + (offy + y))*fft_mz + offz;
             i0t = (x*nsy + y)*nsz;
-            for(z=0; z<nf[ZZ]; z++)
+            for (z = 0; z < nf[ZZ]; z++)
             {
                 fftgrid[i0+z] = grid_th[i0t+z];
             }
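
The index expressions above are plain row-major flattening: the thread-local grid uses strides nsy, nsz while the node-wide FFT grid uses strides fft_my, fft_mz. A minimal sketch of that flattening, with hypothetical grid dimensions:

    #include <stdio.h>

    /* Illustrative only: row-major flattening as used for i0 and i0t above. */
    static int flat_index(int x, int y, int z, int my, int mz)
    {
        return (x*my + y)*mz + z;
    }

    int main(void)
    {
        /* Hypothetical 4 x 5 x 6 grid stored without padding. */
        printf("(1,2,3) -> %d\n", flat_index(1, 2, 3, 5, 6));   /* (1*5 + 2)*6 + 3 = 45 */
        return 0;
    }
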
@@ -3416,24 +3506,24 @@ static void copy_local_grid(gmx_pme_t pme,
 
 static void
 reduce_threadgrid_overlap(gmx_pme_t pme,
-                          const pmegrids_t *pmegrids,int thread,
-                          real *fftgrid,real *commbuf_x,real *commbuf_y)
+                          const pmegrids_t *pmegrids, int thread,
+                          real *fftgrid, real *commbuf_x, real *commbuf_y)
 {
-    ivec local_fft_ndata,local_fft_offset,local_fft_size;
-    int  fft_nx,fft_ny,fft_nz;
-    int  fft_my,fft_mz;
-    int  buf_my=-1;
-    int  nsx,nsy,nsz;
+    ivec local_fft_ndata, local_fft_offset, local_fft_size;
+    int  fft_nx, fft_ny, fft_nz;
+    int  fft_my, fft_mz;
+    int  buf_my = -1;
+    int  nsx, nsy, nsz;
     ivec ne;
-    int  offx,offy,offz,x,y,z,i0,i0t;
-    int  sx,sy,sz,fx,fy,fz,tx1,ty1,tz1,ox,oy,oz;
-    gmx_bool bClearBufX,bClearBufY,bClearBufXY,bClearBuf;
-    gmx_bool bCommX,bCommY;
+    int  offx, offy, offz, x, y, z, i0, i0t;
+    int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
+    gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
+    gmx_bool bCommX, bCommY;
     int  d;
     int  thread_f;
-    const pmegrid_t *pmegrid,*pmegrid_g,*pmegrid_f;
+    const pmegrid_t *pmegrid, *pmegrid_g, *pmegrid_f;
     const real *grid_th;
-    real *commbuf=NULL;
+    real *commbuf = NULL;
 
     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
                                    local_fft_ndata,
@@ -3458,7 +3548,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
      */
     pmegrid = &pmegrids->grid_th[thread];
 
-    for(d=0; d<DIM; d++)
+    for (d = 0; d < DIM; d++)
     {
         ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
                     local_fft_ndata[d]);
@@ -3479,49 +3569,51 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
     /* Note that fft_nx/y is equal to the number of grid points
      * between the first point of our node grid and that of the next node.
      */
-    for(sx=0; sx>=-pmegrids->nthread_comm[XX]; sx--)
+    for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
     {
-        fx = pmegrid->ci[XX] + sx;
-        ox = 0;
+        fx     = pmegrid->ci[XX] + sx;
+        ox     = 0;
         bCommX = FALSE;
-        if (fx < 0) {
-            fx += pmegrids->nc[XX];
-            ox -= fft_nx;
+        if (fx < 0)
+        {
+            fx    += pmegrids->nc[XX];
+            ox    -= fft_nx;
             bCommX = (pme->nnodes_major > 1);
         }
         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
-        ox += pmegrid_g->offset[XX];
+        ox       += pmegrid_g->offset[XX];
         if (!bCommX)
         {
-            tx1 = min(ox + pmegrid_g->n[XX],ne[XX]);
+            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
         }
         else
         {
-            tx1 = min(ox + pmegrid_g->n[XX],pme->pme_order);
+            tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
         }
 
-        for(sy=0; sy>=-pmegrids->nthread_comm[YY]; sy--)
+        for (sy = 0; sy >= -pmegrids->nthread_comm[YY]; sy--)
         {
-            fy = pmegrid->ci[YY] + sy;
-            oy = 0;
+            fy     = pmegrid->ci[YY] + sy;
+            oy     = 0;
             bCommY = FALSE;
-            if (fy < 0) {
-                fy += pmegrids->nc[YY];
-                oy -= fft_ny;
+            if (fy < 0)
+            {
+                fy    += pmegrids->nc[YY];
+                oy    -= fft_ny;
                 bCommY = (pme->nnodes_minor > 1);
             }
             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
-            oy += pmegrid_g->offset[YY];
+            oy       += pmegrid_g->offset[YY];
             if (!bCommY)
             {
-                ty1 = min(oy + pmegrid_g->n[YY],ne[YY]);
+                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
             }
             else
             {
-                ty1 = min(oy + pmegrid_g->n[YY],pme->pme_order);
+                ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
             }
 
-            for(sz=0; sz>=-pmegrids->nthread_comm[ZZ]; sz--)
+            for (sz = 0; sz >= -pmegrids->nthread_comm[ZZ]; sz--)
             {
                 fz = pmegrid->ci[ZZ] + sz;
                 oz = 0;
@@ -3531,8 +3623,8 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                     oz -= fft_nz;
                 }
                 pmegrid_g = &pmegrids->grid_th[fz];
-                oz += pmegrid_g->offset[ZZ];
-                tz1 = min(oz + pmegrid_g->n[ZZ],ne[ZZ]);
+                oz       += pmegrid_g->offset[ZZ];
+                tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
 
                 if (sx == 0 && sy == 0 && sz == 0)
                 {
@@ -3554,26 +3646,26 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
 
 #ifdef DEBUG_PME_REDUCE
                 printf("n%d t%d add %d  %2d %2d %2d  %2d %2d %2d  %2d-%2d %2d-%2d, %2d-%2d %2d-%2d, %2d-%2d %2d-%2d\n",
-                       pme->nodeid,thread,thread_f,
+                       pme->nodeid, thread, thread_f,
                        pme->pmegrid_start_ix,
                        pme->pmegrid_start_iy,
                        pme->pmegrid_start_iz,
-                       sx,sy,sz,
-                       offx-ox,tx1-ox,offx,tx1,
-                       offy-oy,ty1-oy,offy,ty1,
-                       offz-oz,tz1-oz,offz,tz1);
+                       sx, sy, sz,
+                       offx-ox, tx1-ox, offx, tx1,
+                       offy-oy, ty1-oy, offy, ty1,
+                       offz-oz, tz1-oz, offz, tz1);
 #endif
 
                 if (!(bCommX || bCommY))
                 {
                     /* Copy from the thread local grid to the node grid */
-                    for(x=offx; x<tx1; x++)
+                    for (x = offx; x < tx1; x++)
                     {
-                        for(y=offy; y<ty1; y++)
+                        for (y = offy; y < ty1; y++)
                         {
                             i0  = (x*fft_my + y)*fft_mz;
                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
-                            for(z=offz; z<tz1; z++)
+                            for (z = offz; z < tz1; z++)
                             {
                                 fftgrid[i0+z] += grid_th[i0t+z];
                             }
@@ -3594,7 +3686,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                             /* We index commbuf modulo the local grid size */
                             commbuf += buf_my*fft_nx*fft_nz;
 
-                            bClearBuf  = bClearBufXY;
+                            bClearBuf   = bClearBufXY;
                             bClearBufXY = FALSE;
                         }
                         else
@@ -3605,16 +3697,16 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                     }
                     else
                     {
-                        commbuf = commbuf_x;
-                        buf_my  = fft_ny;
+                        commbuf    = commbuf_x;
+                        buf_my     = fft_ny;
                         bClearBuf  = bClearBufX;
                         bClearBufX = FALSE;
                     }
 
                     /* Copy to the communication buffer */
-                    for(x=offx; x<tx1; x++)
+                    for (x = offx; x < tx1; x++)
                     {
-                        for(y=offy; y<ty1; y++)
+                        for (y = offy; y < ty1; y++)
                         {
                             i0  = (x*buf_my + y)*fft_nz;
                             i0t = ((x - ox)*nsy + (y - oy))*nsz - oz;
@@ -3622,14 +3714,14 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                             if (bClearBuf)
                             {
                                 /* First access of commbuf, initialize it */
-                                for(z=offz; z<tz1; z++)
+                                for (z = offz; z < tz1; z++)
                                 {
                                     commbuf[i0+z]  = grid_th[i0t+z];
                                 }
                             }
                             else
                             {
-                                for(z=offz; z<tz1; z++)
+                                for (z = offz; z < tz1; z++)
                                 {
                                     commbuf[i0+z] += grid_th[i0t+z];
                                 }
@@ -3643,19 +3735,19 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
 }
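
The bClearBuf logic above implements clear-on-first-use accumulation: the first contribution to a communication buffer is written directly, and every later contribution is added. A minimal sketch of that pattern on a plain array; all names and values here are hypothetical.

    #include <stdio.h>

    /* Illustrative only: clear-on-first-use accumulation as done with
     * bClearBuf above, shown on a small array.
     */
    int main(void)
    {
        double buf[4];
        int    bClearBuf = 1;   /* first access must initialize, not add */
        int    pass, i;

        for (pass = 0; pass < 3; pass++)
        {
            for (i = 0; i < 4; i++)
            {
                double contrib = pass + 0.5*i;   /* stand-in for grid values */

                if (bClearBuf)
                {
                    buf[i] = contrib;    /* first access: initialize */
                }
                else
                {
                    buf[i] += contrib;   /* later accesses: accumulate */
                }
            }
            bClearBuf = 0;
        }
        for (i = 0; i < 4; i++)
        {
            printf("buf[%d] = %g\n", i, buf[i]);
        }
        return 0;
    }
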
 
 
-static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
+static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
 {
-    ivec local_fft_ndata,local_fft_offset,local_fft_size;
+    ivec local_fft_ndata, local_fft_offset, local_fft_size;
     pme_overlap_t *overlap;
-    int  send_index0,send_nindex;
+    int  send_index0, send_nindex;
     int  recv_nindex;
 #ifdef GMX_MPI
     MPI_Status stat;
 #endif
-    int  send_size_y,recv_size_y;
-    int  ipulse,send_id,recv_id,datasize,gridsize,size_yx;
-    real *sendptr,*recvptr;
-    int  x,y,z,indg,indb;
+    int  send_size_y, recv_size_y;
+    int  ipulse, send_id, recv_id, datasize, gridsize, size_yx;
+    real *sendptr, *recvptr;
+    int  x, y, z, indg, indb;
 
     /* Note that this routine is only used for forward communication.
      * Since the force gathering, unlike the charge spreading,
@@ -3676,7 +3768,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
 
         if (pme->nnodes_major > 1)
         {
-             size_yx = pme->overlap[0].comm_data[0].send_nindex;
+            size_yx = pme->overlap[0].comm_data[0].send_nindex;
         }
         else
         {
@@ -3686,10 +3778,10 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
 
         send_size_y = overlap->send_size;
 
-        for(ipulse=0;ipulse<overlap->noverlap_nodes;ipulse++)
+        for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
         {
-            send_id = overlap->send_id[ipulse];
-            recv_id = overlap->recv_id[ipulse];
+            send_id       = overlap->send_id[ipulse];
+            recv_id       = overlap->recv_id[ipulse];
             send_index0   =
                 overlap->comm_data[ipulse].send_index0 -
                 overlap->comm_data[0].send_index0;
@@ -3702,20 +3794,20 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
             recvptr = overlap->recvbuf;
 
 #ifdef GMX_MPI
-            MPI_Sendrecv(sendptr,send_size_y*datasize,GMX_MPI_REAL,
-                         send_id,ipulse,
-                         recvptr,recv_size_y*datasize,GMX_MPI_REAL,
-                         recv_id,ipulse,
-                         overlap->mpi_comm,&stat);
+            MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
+                         send_id, ipulse,
+                         recvptr, recv_size_y*datasize, GMX_MPI_REAL,
+                         recv_id, ipulse,
+                         overlap->mpi_comm, &stat);
 #endif
 
-            for(x=0; x<local_fft_ndata[XX]; x++)
+            for (x = 0; x < local_fft_ndata[XX]; x++)
             {
-                for(y=0; y<recv_nindex; y++)
+                for (y = 0; y < recv_nindex; y++)
                 {
                     indg = (x*local_fft_size[YY] + y)*local_fft_size[ZZ];
                     indb = (x*recv_size_y        + y)*local_fft_ndata[ZZ];
-                    for(z=0; z<local_fft_ndata[ZZ]; z++)
+                    for (z = 0; z < local_fft_ndata[ZZ]; z++)
                     {
                         fftgrid[indg+z] += recvptr[indb+z];
                     }
@@ -3726,13 +3818,13 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
             {
                 /* Copy from the received buffer to the send buffer for dim 0 */
                 sendptr = pme->overlap[0].sendbuf;
-                for(x=0; x<size_yx; x++)
+                for (x = 0; x < size_yx; x++)
                 {
-                    for(y=0; y<recv_nindex; y++)
+                    for (y = 0; y < recv_nindex; y++)
                     {
                         indg = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
                         indb = ((local_fft_ndata[XX] + x)*recv_size_y + y)*local_fft_ndata[ZZ];
-                        for(z=0; z<local_fft_ndata[ZZ]; z++)
+                        for (z = 0; z < local_fft_ndata[ZZ]; z++)
                         {
                             sendptr[indg+z] += recvptr[indb+z];
                         }
@@ -3756,8 +3848,8 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
 
         ipulse = 0;
 
-        send_id = overlap->send_id[ipulse];
-        recv_id = overlap->recv_id[ipulse];
+        send_id       = overlap->send_id[ipulse];
+        recv_id       = overlap->recv_id[ipulse];
         send_nindex   = overlap->comm_data[ipulse].send_nindex;
         /* We don't use recv_index0, as we always receive starting at 0 */
         recv_nindex   = overlap->comm_data[ipulse].recv_nindex;
@@ -3767,25 +3859,25 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
 
         if (debug != NULL)
         {
-            fprintf(debug,"PME fftgrid comm %2d x %2d x %2d\n",
-                   send_nindex,local_fft_ndata[YY],local_fft_ndata[ZZ]);
+            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+                    send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
         }
 
 #ifdef GMX_MPI
-        MPI_Sendrecv(sendptr,send_nindex*datasize,GMX_MPI_REAL,
-                     send_id,ipulse,
-                     recvptr,recv_nindex*datasize,GMX_MPI_REAL,
-                     recv_id,ipulse,
-                     overlap->mpi_comm,&stat);
+        MPI_Sendrecv(sendptr, send_nindex*datasize, GMX_MPI_REAL,
+                     send_id, ipulse,
+                     recvptr, recv_nindex*datasize, GMX_MPI_REAL,
+                     recv_id, ipulse,
+                     overlap->mpi_comm, &stat);
 #endif
 
-        for(x=0; x<recv_nindex; x++)
+        for (x = 0; x < recv_nindex; x++)
         {
-            for(y=0; y<local_fft_ndata[YY]; y++)
+            for (y = 0; y < local_fft_ndata[YY]; y++)
             {
                 indg = (x*local_fft_size[YY]  + y)*local_fft_size[ZZ];
                 indb = (x*local_fft_ndata[YY] + y)*local_fft_ndata[ZZ];
-                for(z=0; z<local_fft_ndata[ZZ]; z++)
+                for (z = 0; z < local_fft_ndata[ZZ]; z++)
                 {
                     fftgrid[indg+z] += recvptr[indb+z];
                 }
@@ -3796,20 +3888,20 @@ static void sum_fftgrid_dd(gmx_pme_t pme,real *fftgrid)
 
 
 static void spread_on_grid(gmx_pme_t pme,
-                           pme_atomcomm_t *atc,pmegrids_t *grids,
-                           gmx_bool bCalcSplines,gmx_bool bSpread,
+                           pme_atomcomm_t *atc, pmegrids_t *grids,
+                           gmx_bool bCalcSplines, gmx_bool bSpread,
                            real *fftgrid)
 {
-    int nthread,thread;
+    int nthread, thread;
 #ifdef PME_TIME_THREADS
-    gmx_cycles_t c1,c2,c3,ct1a,ct1b,ct1c;
-    static double cs1=0,cs2=0,cs3=0;
-    static double cs1a[6]={0,0,0,0,0,0};
-    static int cnt=0;
+    gmx_cycles_t c1, c2, c3, ct1a, ct1b, ct1c;
+    static double cs1     = 0, cs2 = 0, cs3 = 0;
+    static double cs1a[6] = {0, 0, 0, 0, 0, 0};
+    static int cnt        = 0;
 #endif
 
     nthread = pme->nthread;
-    assert(nthread>0);
+    assert(nthread > 0);
 
 #ifdef PME_TIME_THREADS
     c1 = omp_cyc_start();
@@ -3817,9 +3909,9 @@ static void spread_on_grid(gmx_pme_t pme,
     if (bCalcSplines)
     {
 #pragma omp parallel for num_threads(nthread) schedule(static)
-        for(thread=0; thread<nthread; thread++)
+        for (thread = 0; thread < nthread; thread++)
         {
-            int start,end;
+            int start, end;
 
             start = atc->n* thread   /nthread;
             end   = atc->n*(thread+1)/nthread;
@@ -3827,11 +3919,11 @@ static void spread_on_grid(gmx_pme_t pme,
             /* Compute fftgrid index for all atoms,
              * with the help of some extra variables.
              */
-            calc_interpolation_idx(pme,atc,start,end,thread);
+            calc_interpolation_idx(pme, atc, start, end, thread);
         }
     }
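
The start/end computed in the loop above is the usual static partition of n atoms over nthread threads: thread t handles the half-open range [n*t/nthread, n*(t+1)/nthread), which covers every atom exactly once even when n is not divisible by nthread. A minimal sketch with hypothetical counts:

    #include <stdio.h>

    /* Illustrative only: the per-thread [start, end) atom ranges used above. */
    int main(void)
    {
        int n = 10, nthread = 3, t;   /* hypothetical atom and thread counts */

        for (t = 0; t < nthread; t++)
        {
            int start = n*t/nthread;
            int end   = n*(t + 1)/nthread;

            printf("thread %d: atoms [%d, %d)\n", t, start, end);
        }
        return 0;
    }
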
 #ifdef PME_TIME_THREADS
-    c1 = omp_cyc_end(c1);
+    c1   = omp_cyc_end(c1);
     cs1 += (double)c1;
 #endif
 
@@ -3839,7 +3931,7 @@ static void spread_on_grid(gmx_pme_t pme,
     c2 = omp_cyc_start();
 #endif
 #pragma omp parallel for num_threads(nthread) schedule(static)
-    for(thread=0; thread<nthread; thread++)
+    for (thread = 0; thread < nthread; thread++)
     {
         splinedata_t *spline;
         pmegrid_t *grid;
@@ -3857,15 +3949,15 @@ static void spread_on_grid(gmx_pme_t pme,
         {
             spline = &atc->spline[thread];
 
-            make_thread_local_ind(atc,thread,spline);
+            make_thread_local_ind(atc, thread, spline);
 
             grid = &grids->grid_th[thread];
         }
 
         if (bCalcSplines)
         {
-            make_bsplines(spline->theta,spline->dtheta,pme->pme_order,
-                          atc->fractx,spline->n,spline->ind,atc->q,pme->bFEP);
+            make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
+                          atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
         }
 
         if (bSpread)
@@ -3874,20 +3966,20 @@ static void spread_on_grid(gmx_pme_t pme,
 #ifdef PME_TIME_SPREAD
             ct1a = omp_cyc_start();
 #endif
-            spread_q_bsplines_thread(grid,atc,spline,pme->spline_work);
+            spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
 
             if (grids->nthread > 1)
             {
-                copy_local_grid(pme,grids,thread,fftgrid);
+                copy_local_grid(pme, grids, thread, fftgrid);
             }
 #ifdef PME_TIME_SPREAD
-            ct1a = omp_cyc_end(ct1a);
+            ct1a          = omp_cyc_end(ct1a);
             cs1a[thread] += (double)ct1a;
 #endif
         }
     }
 #ifdef PME_TIME_THREADS
-    c2 = omp_cyc_end(c2);
+    c2   = omp_cyc_end(c2);
     cs2 += (double)c2;
 #endif
 
@@ -3897,22 +3989,22 @@ static void spread_on_grid(gmx_pme_t pme,
         c3 = omp_cyc_start();
 #endif
 #pragma omp parallel for num_threads(grids->nthread) schedule(static)
-        for(thread=0; thread<grids->nthread; thread++)
+        for (thread = 0; thread < grids->nthread; thread++)
         {
-            reduce_threadgrid_overlap(pme,grids,thread,
+            reduce_threadgrid_overlap(pme, grids, thread,
                                       fftgrid,
                                       pme->overlap[0].sendbuf,
                                       pme->overlap[1].sendbuf);
         }
 #ifdef PME_TIME_THREADS
-        c3 = omp_cyc_end(c3);
+        c3   = omp_cyc_end(c3);
         cs3 += (double)c3;
 #endif
 
         if (pme->nnodes > 1)
         {
             /* Communicate the overlapping part of the fftgrid */
-            sum_fftgrid_dd(pme,fftgrid);
+            sum_fftgrid_dd(pme, fftgrid);
         }
     }
 
@@ -3921,10 +4013,12 @@ static void spread_on_grid(gmx_pme_t pme,
     if (cnt % 20 == 0)
     {
         printf("idx %.2f spread %.2f red %.2f",
-               cs1*1e-9,cs2*1e-9,cs3*1e-9);
+               cs1*1e-9, cs2*1e-9, cs3*1e-9);
 #ifdef PME_TIME_SPREAD
-        for(thread=0; thread<nthread; thread++)
-            printf(" %.2f",cs1a[thread]*1e-9);
+        for (thread = 0; thread < nthread; thread++)
+        {
+            printf(" %.2f", cs1a[thread]*1e-9);
+        }
 #endif
         printf("\n");
     }
@@ -3933,27 +4027,27 @@ static void spread_on_grid(gmx_pme_t pme,
 
 
 static void dump_grid(FILE *fp,
-                      int sx,int sy,int sz,int nx,int ny,int nz,
-                      int my,int mz,const real *g)
+                      int sx, int sy, int sz, int nx, int ny, int nz,
+                      int my, int mz, const real *g)
 {
-    int x,y,z;
+    int x, y, z;
 
-    for(x=0; x<nx; x++)
+    for (x = 0; x < nx; x++)
     {
-        for(y=0; y<ny; y++)
+        for (y = 0; y < ny; y++)
         {
-            for(z=0; z<nz; z++)
+            for (z = 0; z < nz; z++)
             {
-                fprintf(fp,"%2d %2d %2d %6.3f\n",
-                        sx+x,sy+y,sz+z,g[(x*my + y)*mz + z]);
+                fprintf(fp, "%2d %2d %2d %6.3f\n",
+                        sx+x, sy+y, sz+z, g[(x*my + y)*mz + z]);
             }
         }
     }
 }
 
-static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
+static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
 {
-    ivec local_fft_ndata,local_fft_offset,local_fft_size;
+    ivec local_fft_ndata, local_fft_offset, local_fft_size;
 
     gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
                                    local_fft_ndata,
@@ -3973,7 +4067,7 @@ static void dump_local_fftgrid(gmx_pme_t pme,const real *fftgrid)
 }
 
 
-void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
+void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
 {
     pme_atomcomm_t *atc;
     pmegrids_t *grid;
@@ -3987,11 +4081,11 @@ void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
         gmx_incons("gmx_pme_calc_energy with free energy");
     }
 
-    atc = &pme->atc_energy;
+    atc            = &pme->atc_energy;
     atc->nthread   = 1;
     if (atc->spline == NULL)
     {
-        snew(atc->spline,atc->nthread);
+        snew(atc->spline, atc->nthread);
     }
     atc->nslab     = 1;
     atc->bSpread   = TRUE;
@@ -4004,22 +4098,22 @@ void gmx_pme_calc_energy(gmx_pme_t pme,int n,rvec *x,real *q,real *V)
     /* We only use the A-charges grid */
     grid = &pme->pmegridA;
 
-    spread_on_grid(pme,atc,NULL,TRUE,FALSE,pme->fftgridA);
+    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
 
-    *V = gather_energy_bsplines(pme,grid->grid.grid,atc);
+    *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
 }
 
 
-static void reset_pmeonly_counters(t_commrec *cr,gmx_wallcycle_t wcycle,
-        t_nrnb *nrnb,t_inputrec *ir, gmx_large_int_t step_rel)
+static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
+                                   t_nrnb *nrnb, t_inputrec *ir, gmx_large_int_t step_rel)
 {
     /* Reset all the counters related to performance over the run */
-    wallcycle_stop(wcycle,ewcRUN);
+    wallcycle_stop(wcycle, ewcRUN);
     wallcycle_reset_all(wcycle);
     init_nrnb(nrnb);
     ir->init_step += step_rel;
     ir->nsteps    -= step_rel;
-    wallcycle_start(wcycle,ewcRUN);
+    wallcycle_start(wcycle, ewcRUN);
 }
 
 
@@ -4048,10 +4142,10 @@ static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
     }
 
     (*npmedata)++;
-    srenew(*pmedata,*npmedata);
+    srenew(*pmedata, *npmedata);
 
     /* Generate a new PME data structure, copying part of the old pointers */
-    gmx_pme_reinit(&((*pmedata)[ind]),cr,pme,ir,grid_size);
+    gmx_pme_reinit(&((*pmedata)[ind]), cr, pme, ir, grid_size);
 
     *pme_ret = (*pmedata)[ind];
 }
@@ -4068,21 +4162,21 @@ int gmx_pmeonly(gmx_pme_t pme,
     gmx_pme_pp_t pme_pp;
     int  natoms;
     matrix box;
-    rvec *x_pp=NULL,*f_pp=NULL;
-    real *chargeA=NULL,*chargeB=NULL;
-    real lambda=0;
-    int  maxshift_x=0,maxshift_y=0;
-    real energy,dvdlambda;
+    rvec *x_pp      = NULL, *f_pp = NULL;
+    real *chargeA   = NULL, *chargeB = NULL;
+    real lambda     = 0;
+    int  maxshift_x = 0, maxshift_y = 0;
+    real energy, dvdlambda;
     matrix vir;
     float cycles;
     int  count;
     gmx_bool bEnerVir;
-    gmx_large_int_t step,step_rel;
+    gmx_large_int_t step, step_rel;
     ivec grid_switch;
 
     /* This data will only be used with PME tuning, i.e. switching PME grids */
     npmedata = 1;
-    snew(pmedata,npmedata);
+    snew(pmedata, npmedata);
     pmedata[0] = pme;
 
     pme_pp = gmx_pme_pp_init(cr);
@@ -4097,17 +4191,17 @@ int gmx_pmeonly(gmx_pme_t pme,
         {
             /* Domain decomposition */
             natoms = gmx_pme_recv_q_x(pme_pp,
-                                      &chargeA,&chargeB,box,&x_pp,&f_pp,
-                                      &maxshift_x,&maxshift_y,
-                                      &pme->bFEP,&lambda,
+                                      &chargeA, &chargeB, box, &x_pp, &f_pp,
+                                      &maxshift_x, &maxshift_y,
+                                      &pme->bFEP, &lambda,
                                       &bEnerVir,
                                       &step,
-                                      grid_switch,&ewaldcoeff);
+                                      grid_switch, &ewaldcoeff);
 
             if (natoms == -2)
             {
                 /* Switch the PME grid to grid_switch */
-                gmx_pmeonly_switch(&npmedata,&pmedata,grid_switch,cr,ir,&pme);
+                gmx_pmeonly_switch(&npmedata, &pmedata, grid_switch, cr, ir, &pme);
             }
         }
         while (natoms == -2);
@@ -4121,21 +4215,23 @@ int gmx_pmeonly(gmx_pme_t pme,
         step_rel = step - ir->init_step;
 
         if (count == 0)
-            wallcycle_start(wcycle,ewcRUN);
+        {
+            wallcycle_start(wcycle, ewcRUN);
+        }
 
-        wallcycle_start(wcycle,ewcPMEMESH);
+        wallcycle_start(wcycle, ewcPMEMESH);
 
         dvdlambda = 0;
         clear_mat(vir);
-        gmx_pme_do(pme,0,natoms,x_pp,f_pp,chargeA,chargeB,box,
-                   cr,maxshift_x,maxshift_y,nrnb,wcycle,vir,ewaldcoeff,
-                   &energy,lambda,&dvdlambda,
+        gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
+                   cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
+                   &energy, lambda, &dvdlambda,
                    GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
 
-        cycles = wallcycle_stop(wcycle,ewcPMEMESH);
+        cycles = wallcycle_stop(wcycle, ewcPMEMESH);
 
         gmx_pme_send_force_vir_ener(pme_pp,
-                                    f_pp,vir,energy,dvdlambda,
+                                    f_pp, vir, energy, dvdlambda,
                                     cycles);
 
         count++;
@@ -4143,7 +4239,7 @@ int gmx_pmeonly(gmx_pme_t pme,
         if (step_rel == wcycle_get_reset_counters(wcycle))
         {
             /* Reset all the counters related to performance over the run */
-            reset_pmeonly_counters(cr,wcycle,nrnb,ir,step_rel);
+            reset_pmeonly_counters(cr, wcycle, nrnb, ir, step_rel);
             wcycle_set_reset_counters(wcycle, 0);
         }
 
@@ -4164,15 +4260,15 @@ int gmx_pme_do(gmx_pme_t pme,
                real *energy,    real lambda,
                real *dvdlambda, int flags)
 {
-    int     q,d,i,j,ntot,npme;
-    int     nx,ny,nz;
-    int     n_d,local_ny;
-    pme_atomcomm_t *atc=NULL;
-    pmegrids_t *pmegrid=NULL;
-    real    *grid=NULL;
+    int     q, d, i, j, ntot, npme;
+    int     nx, ny, nz;
+    int     n_d, local_ny;
+    pme_atomcomm_t *atc = NULL;
+    pmegrids_t *pmegrid = NULL;
+    real    *grid       = NULL;
     real    *ptr;
-    rvec    *x_d,*f_d;
-    real    *charge=NULL,*q_d;
+    rvec    *x_d, *f_d;
+    real    *charge = NULL, *q_d;
     real    energy_AB[2];
     matrix  vir_AB[2];
     gmx_bool bClearF;
@@ -4181,19 +4277,21 @@ int gmx_pme_do(gmx_pme_t pme,
     t_complex * cfftgrid;
     int     thread;
     const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
-    const gmx_bool bCalcF = flags & GMX_PME_CALC_F;
+    const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
 
     assert(pme->nnodes > 0);
     assert(pme->nnodes == 1 || pme->ndecompdim > 0);
 
-    if (pme->nnodes > 1) {
-        atc = &pme->atc[0];
+    if (pme->nnodes > 1)
+    {
+        atc      = &pme->atc[0];
         atc->npd = homenr;
-        if (atc->npd > atc->pd_nalloc) {
+        if (atc->npd > atc->pd_nalloc)
+        {
             atc->pd_nalloc = over_alloc_dd(atc->npd);
-            srenew(atc->pd,atc->pd_nalloc);
+            srenew(atc->pd, atc->pd_nalloc);
         }
-        atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
+        atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
     }
     else
     {
@@ -4201,45 +4299,56 @@ int gmx_pme_do(gmx_pme_t pme,
         pme->atc[0].n = homenr;
     }
 
-    for(q=0; q<(pme->bFEP ? 2 : 1); q++) {
-        if (q == 0) {
-            pmegrid = &pme->pmegridA;
-            fftgrid = pme->fftgridA;
-            cfftgrid = pme->cfftgridA;
+    for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
+    {
+        if (q == 0)
+        {
+            pmegrid    = &pme->pmegridA;
+            fftgrid    = pme->fftgridA;
+            cfftgrid   = pme->cfftgridA;
             pfft_setup = pme->pfft_setupA;
-            charge = chargeA+start;
-        } else {
-            pmegrid = &pme->pmegridB;
-            fftgrid = pme->fftgridB;
-            cfftgrid = pme->cfftgridB;
+            charge     = chargeA+start;
+        }
+        else
+        {
+            pmegrid    = &pme->pmegridB;
+            fftgrid    = pme->fftgridB;
+            cfftgrid   = pme->cfftgridB;
             pfft_setup = pme->pfft_setupB;
-            charge = chargeB+start;
+            charge     = chargeB+start;
         }
         grid = pmegrid->grid.grid;
         /* Unpack structure */
-        if (debug) {
-            fprintf(debug,"PME: nnodes = %d, nodeid = %d\n",
-                    cr->nnodes,cr->nodeid);
-            fprintf(debug,"Grid = %p\n",(void*)grid);
+        if (debug)
+        {
+            fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+                    cr->nnodes, cr->nodeid);
+            fprintf(debug, "Grid = %p\n", (void*)grid);
             if (grid == NULL)
-                gmx_fatal(FARGS,"No grid!");
+            {
+                gmx_fatal(FARGS, "No grid!");
+            }
         }
         where();
 
-        m_inv_ur0(box,pme->recipbox);
+        m_inv_ur0(box, pme->recipbox);
 
-        if (pme->nnodes == 1) {
+        if (pme->nnodes == 1)
+        {
             atc = &pme->atc[0];
-            if (DOMAINDECOMP(cr)) {
+            if (DOMAINDECOMP(cr))
+            {
                 atc->n = homenr;
                 pme_realloc_atomcomm_things(atc);
             }
             atc->x = x;
             atc->q = charge;
             atc->f = f;
-        } else {
-            wallcycle_start(wcycle,ewcPME_REDISTXF);
-            for(d=pme->ndecompdim-1; d>=0; d--)
+        }
+        else
+        {
+            wallcycle_start(wcycle, ewcPME_REDISTXF);
+            for (d = pme->ndecompdim-1; d >= 0; d--)
             {
                 if (d == pme->ndecompdim-1)
                 {
@@ -4253,74 +4362,80 @@ int gmx_pme_do(gmx_pme_t pme,
                     x_d = atc->x;
                     q_d = atc->q;
                 }
-                atc = &pme->atc[d];
+                atc      = &pme->atc[d];
                 atc->npd = n_d;
-                if (atc->npd > atc->pd_nalloc) {
+                if (atc->npd > atc->pd_nalloc)
+                {
                     atc->pd_nalloc = over_alloc_dd(atc->npd);
-                    srenew(atc->pd,atc->pd_nalloc);
+                    srenew(atc->pd, atc->pd_nalloc);
                 }
-                atc->maxshift = (atc->dimind==0 ? maxshift_x : maxshift_y);
-                pme_calc_pidx_wrapper(n_d,pme->recipbox,x_d,atc);
+                atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+                pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
                 where();
 
                 /* Redistribute x (only once) and qA or qB */
-                if (DOMAINDECOMP(cr)) {
-                    dd_pmeredist_x_q(pme, n_d, q==0, x_d, q_d, atc);
-                } else {
-                    pmeredist_pd(pme, TRUE, n_d, q==0, x_d, q_d, atc);
+                if (DOMAINDECOMP(cr))
+                {
+                    dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
+                }
+                else
+                {
+                    pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
                 }
             }
             where();
 
-            wallcycle_stop(wcycle,ewcPME_REDISTXF);
+            wallcycle_stop(wcycle, ewcPME_REDISTXF);
         }
 
         if (debug)
-            fprintf(debug,"Node= %6d, pme local particles=%6d\n",
-                    cr->nodeid,atc->n);
+        {
+            fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+                    cr->nodeid, atc->n);
+        }
 
         if (flags & GMX_PME_SPREAD_Q)
         {
-            wallcycle_start(wcycle,ewcPME_SPREADGATHER);
+            wallcycle_start(wcycle, ewcPME_SPREADGATHER);
 
             /* Spread the charges on a grid */
-            spread_on_grid(pme,&pme->atc[0],pmegrid,q==0,TRUE,fftgrid);
+            spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
 
             if (q == 0)
             {
-                inc_nrnb(nrnb,eNR_WEIGHTS,DIM*atc->n);
+                inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
             }
-            inc_nrnb(nrnb,eNR_SPREADQBSP,
+            inc_nrnb(nrnb, eNR_SPREADQBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 
             if (pme->nthread == 1)
             {
-                wrap_periodic_pmegrid(pme,grid);
+                wrap_periodic_pmegrid(pme, grid);
 
                 /* sum contributions to local grid from other nodes */
 #ifdef GMX_MPI
                 if (pme->nnodes > 1)
                 {
-                    gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_FORWARD);
+                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
                     where();
                 }
 #endif
 
-                copy_pmegrid_to_fftgrid(pme,grid,fftgrid);
+                copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
             }
 
-            wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
+            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
 
             /*
-            dump_local_fftgrid(pme,fftgrid);
-            exit(0);
-            */
+               dump_local_fftgrid(pme,fftgrid);
+               exit(0);
+             */
         }
 
         /* Here we start a large thread parallel region */
 #pragma omp parallel num_threads(pme->nthread) private(thread)
         {
-            thread=gmx_omp_get_thread_num();
+            thread = gmx_omp_get_thread_num();
             if (flags & GMX_PME_SOLVE)
             {
                 int loop_count;
@@ -4328,31 +4443,31 @@ int gmx_pme_do(gmx_pme_t pme,
                 /* do 3d-fft */
                 if (thread == 0)
                 {
-                    wallcycle_start(wcycle,ewcPME_FFT);
+                    wallcycle_start(wcycle, ewcPME_FFT);
                 }
-                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_REAL_TO_COMPLEX,
-                                           fftgrid,cfftgrid,thread,wcycle);
+                gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+                                           fftgrid, cfftgrid, thread, wcycle);
                 if (thread == 0)
                 {
-                    wallcycle_stop(wcycle,ewcPME_FFT);
+                    wallcycle_stop(wcycle, ewcPME_FFT);
                 }
                 where();
 
                 /* solve in k-space for our local cells */
                 if (thread == 0)
                 {
-                    wallcycle_start(wcycle,ewcPME_SOLVE);
+                    wallcycle_start(wcycle, ewcPME_SOLVE);
                 }
                 loop_count =
-                    solve_pme_yzx(pme,cfftgrid,ewaldcoeff,
+                    solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
                                   box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
                                   bCalcEnerVir,
-                                  pme->nthread,thread);
+                                  pme->nthread, thread);
                 if (thread == 0)
                 {
-                    wallcycle_stop(wcycle,ewcPME_SOLVE);
+                    wallcycle_stop(wcycle, ewcPME_SOLVE);
                     where();
-                    inc_nrnb(nrnb,eNR_SOLVEPME,loop_count);
+                    inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                 }
             }
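Inside the parallel region every thread enters gmx_parallel_3dfft_execute() and solve_pme_yzx(), but the cycle and flop counters are only touched by thread 0, so nothing is double-counted. The 2*npme increment to eNR_FFT further down is an N*log2(N) estimate for the forward/backward transform pair; a minimal standalone sketch of that estimate (hypothetical function name):

#include <math.h>

/* Sketch only (hypothetical helper): the flop estimate charged to eNR_FFT
 * below, with ntot = nkx*nky*nkz and both the forward and the backward
 * 3D FFT counted, i.e. roughly 2*N*log2(N). */
static double fft_pair_flop_estimate(int nkx, int nky, int nkz)
{
    double ntot = (double)nkx*nky*nkz;

    return 2.0*ntot*log(ntot)/log(2.0);
}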
 
@@ -4362,27 +4477,27 @@ int gmx_pme_do(gmx_pme_t pme,
                 if (thread == 0)
                 {
                     where();
-                    wallcycle_start(wcycle,ewcPME_FFT);
+                    wallcycle_start(wcycle, ewcPME_FFT);
                 }
-                gmx_parallel_3dfft_execute(pfft_setup,GMX_FFT_COMPLEX_TO_REAL,
-                                           cfftgrid,fftgrid,thread,wcycle);
+                gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+                                           cfftgrid, fftgrid, thread, wcycle);
                 if (thread == 0)
                 {
-                    wallcycle_stop(wcycle,ewcPME_FFT);
-                    
+                    wallcycle_stop(wcycle, ewcPME_FFT);
+
                     where();
 
                     if (pme->nodeid == 0)
                     {
-                        ntot = pme->nkx*pme->nky*pme->nkz;
+                        ntot  = pme->nkx*pme->nky*pme->nkz;
                         npme  = ntot*log((real)ntot)/log(2.0);
-                        inc_nrnb(nrnb,eNR_FFT,2*npme);
+                        inc_nrnb(nrnb, eNR_FFT, 2*npme);
                     }
 
-                    wallcycle_start(wcycle,ewcPME_SPREADGATHER);
+                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
                 }
 
-                copy_fftgrid_to_pmegrid(pme,fftgrid,grid,pme->nthread,thread);
+                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
             }
         }
         /* End of thread parallel section.
@@ -4393,13 +4508,14 @@ int gmx_pme_do(gmx_pme_t pme,
         {
             /* distribute local grid to all nodes */
 #ifdef GMX_MPI
-            if (pme->nnodes > 1) {
-                gmx_sum_qgrid_dd(pme,grid,GMX_SUM_QGRID_BACKWARD);
+            if (pme->nnodes > 1)
+            {
+                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
             }
 #endif
             where();
 
-            unwrap_periodic_pmegrid(pme,grid);
+            unwrap_periodic_pmegrid(pme, grid);
 
             /* interpolate forces for our local atoms */
 
@@ -4411,18 +4527,18 @@ int gmx_pme_do(gmx_pme_t pme,
              */
             bClearF = (q == 0 && PAR(cr));
 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
-            for(thread=0; thread<pme->nthread; thread++)
+            for (thread = 0; thread < pme->nthread; thread++)
             {
-                gather_f_bsplines(pme,grid,bClearF,atc,
+                gather_f_bsplines(pme, grid, bClearF, atc,
                                   &atc->spline[thread],
-                                  pme->bFEP ? (q==0 ? 1.0-lambda : lambda) : 1.0);
+                                  pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
             }
 
             where();
 
-            inc_nrnb(nrnb,eNR_GATHERFBSP,
+            inc_nrnb(nrnb, eNR_GATHERFBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
-            wallcycle_stop(wcycle,ewcPME_SPREADGATHER);
+            wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
         }
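gather_f_bsplines() interpolates the mesh forces back with the same order-p stencil, which is why eNR_GATHERFBSP is also charged p^3 operations per atom. In the free-energy case the two charge states write into the same force buffer: only the first pass clears it (bClearF), and each pass is weighted so that the total is F = (1-lambda)*F_A + lambda*F_B. A minimal sketch of that weight selection (hypothetical helper, mirroring the ternary passed to gather_f_bsplines() above):

/* Sketch only (hypothetical helper): the per-state scale factor used when
 * gathering forces. Without FEP there is a single state and the weight is 1;
 * with FEP state A (q == 0) gets 1-lambda and state B (q == 1) gets lambda. */
static real gather_scale(gmx_bool bFEP, int q, real lambda)
{
    if (!bFEP)
    {
        return 1.0;
    }
    return (q == 0) ? 1.0 - lambda : lambda;
}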
 
         if (bCalcEnerVir)
@@ -4430,13 +4546,14 @@ int gmx_pme_do(gmx_pme_t pme,
             /* This should only be called on the master thread
              * and after the threads have synchronized.
              */
-            get_pme_ener_vir(pme,pme->nthread,&energy_AB[q],vir_AB[q]);
+            get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
         }
     } /* of q-loop */
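get_pme_ener_vir() is called on the master thread after the parallel region because solve_pme_yzx() accumulates a partial energy and virial per thread, and those per-thread contributions still have to be summed. A minimal sketch of that kind of reduction (hypothetical names; the real routine also reduces the 3x3 virial):

/* Sketch only (hypothetical helper): sum per-thread partial energies once
 * all threads have finished solving their slice of the reciprocal-space sum. */
static real reduce_thread_energies(const real *ener_thread, int nthread)
{
    real e = 0;
    int  t;

    for (t = 0; t < nthread; t++)
    {
        e += ener_thread[t];
    }
    return e;
}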
 
-    if (bCalcF && pme->nnodes > 1) {
-        wallcycle_start(wcycle,ewcPME_REDISTXF);
-        for(d=0; d<pme->ndecompdim; d++)
+    if (bCalcF && pme->nnodes > 1)
+    {
+        wallcycle_start(wcycle, ewcPME_REDISTXF);
+        for (d = 0; d < pme->ndecompdim; d++)
         {
             atc = &pme->atc[d];
             if (d == pme->ndecompdim - 1)
@@ -4449,31 +4566,37 @@ int gmx_pme_do(gmx_pme_t pme,
                 n_d = pme->atc[d+1].n;
                 f_d = pme->atc[d+1].f;
             }
-            if (DOMAINDECOMP(cr)) {
-                dd_pmeredist_f(pme,atc,n_d,f_d,
-                               d==pme->ndecompdim-1 && pme->bPPnode);
-            } else {
+            if (DOMAINDECOMP(cr))
+            {
+                dd_pmeredist_f(pme, atc, n_d, f_d,
+                               d == pme->ndecompdim-1 && pme->bPPnode);
+            }
+            else
+            {
                 pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
             }
         }
 
-        wallcycle_stop(wcycle,ewcPME_REDISTXF);
+        wallcycle_stop(wcycle, ewcPME_REDISTXF);
     }
     where();
 
     if (bCalcEnerVir)
     {
-        if (!pme->bFEP) {
+        if (!pme->bFEP)
+        {
             *energy = energy_AB[0];
-            m_add(vir,vir_AB[0],vir);
-        } else {
-            *energy = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
+            m_add(vir, vir_AB[0], vir);
+        }
+        else
+        {
+            *energy     = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
             *dvdlambda += energy_AB[1] - energy_AB[0];
-            for(i=0; i<DIM; i++)
+            for (i = 0; i < DIM; i++)
             {
-                for(j=0; j<DIM; j++)
+                for (j = 0; j < DIM; j++)
                 {
-                    vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] + 
+                    vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
                         lambda*vir_AB[1][i][j];
                 }
             }
@@ -4486,7 +4609,7 @@ int gmx_pme_do(gmx_pme_t pme,
 
     if (debug)
     {
-        fprintf(debug,"PME mesh energy: %g\n",*energy);
+        fprintf(debug, "PME mesh energy: %g\n", *energy);
     }
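The combination applied above is the standard linear lambda coupling: E(lambda) = (1-lambda)*E_A + lambda*E_B, so dE/dlambda is simply E_B - E_A, and the 3x3 virial is mixed with the same weights. A minimal sketch of the energy part (hypothetical helper name):

/* Sketch only (hypothetical helper): linear FEP mixing of the two charge
 * states' mesh energies, matching the branch above; vir_AB is mixed with
 * the same (1-lambda, lambda) weights. */
static void mix_fep_energy(real lambda, const real energy_AB[2],
                           real *energy, real *dvdlambda)
{
    *energy     = (1.0 - lambda)*energy_AB[0] + lambda*energy_AB[1];
    *dvdlambda += energy_AB[1] - energy_AB[0];
}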
 
     return 0;