Sort all includes in src/gromacs

[alexxy/gromacs.git] / src / gromacs / mdlib / pme.c
diff --git a/src/gromacs/mdlib/pme.c b/src/gromacs/mdlib/pme.c

index ff52b5b9fe3dbe54bb993ca21d64914bbfeaa0cd..67a12ab58db8d0a8f5b4dc4a2f53f75214da1568 100644 (file)
--- a/src/gromacs/mdlib/pme.c
+++ b/src/gromacs/mdlib/pme.c
@@ -57,40 +57,44 @@
   * /Erik 001109
   */
  
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "gromacs/legacyheaders/pme.h"
  
+#include "config.h"
+
+#include <assert.h>
+#include <math.h>
  #include <stdio.h>
+#include <stdlib.h>
  #include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-#include "txtdump.h"
-#include "vec.h"
-#include "smalloc.h"
-#include "coulomb.h"
-#include "gmx_fatal.h"
-#include "pme.h"
-#include "network.h"
-#include "physics.h"
-#include "nrnb.h"
-#include "macros.h"
  
  #include "gromacs/fft/parallel_3dfft.h"
-#include "gromacs/fileio/futil.h"
  #include "gromacs/fileio/pdbio.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
  #include "gromacs/math/gmxcomplex.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
  #include "gromacs/timing/cyclecounter.h"
  #include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/futil.h"
  #include "gromacs/utility/gmxmpi.h"
  #include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/smalloc.h"
  
  /* Include the SIMD macro file and then check for support */
-#include "gromacs/simd/macros.h"
-#if defined GMX_HAVE_SIMD_MACROS
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#ifdef GMX_SIMD_HAVE_REAL
  /* Turn on arbitrary width SIMD intrinsics for PME solve */
-#define PME_SIMD_SOLVE
+#    define PME_SIMD_SOLVE
  #endif
  
  #define PME_GRID_QA    0 /* Gridindex for A-state for Q */
@@ -108,21 +112,19 @@ const real lb_scale_factor[] = {
  /* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
  const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
  
-/* Include the 4-wide SIMD macro file */
-#include "gromacs/simd/four_wide_macros.h"
  /* Check if we have 4-wide SIMD macro support */
-#ifdef GMX_HAVE_SIMD4_MACROS
+#if (defined GMX_SIMD4_HAVE_REAL)
  /* Do PME spread and gather with 4-wide SIMD.
   * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
   */
-#define PME_SIMD4_SPREAD_GATHER
+#    define PME_SIMD4_SPREAD_GATHER
  
-#ifdef GMX_SIMD4_HAVE_UNALIGNED
+#    if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
  /* With PME-order=4 on x86, unaligned load+store is slightly faster
   * than doubling all SIMD operations when using aligned load+store.
   */
-#define PME_SIMD4_UNALIGNED
-#endif
+#        define PME_SIMD4_UNALIGNED
+#    endif
  #endif
  
  #define DFT_TOL 1e-7
@@ -140,10 +142,10 @@ const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
  #endif
  
  #ifdef PME_SIMD4_SPREAD_GATHER
-#define SIMD4_ALIGNMENT  (GMX_SIMD4_WIDTH*sizeof(real))
+#    define SIMD4_ALIGNMENT  (GMX_SIMD4_WIDTH*sizeof(real))
  #else
  /* We can use any alignment, apart from 0, so we use 4 reals */
-#define SIMD4_ALIGNMENT  (4*sizeof(real))
+#    define SIMD4_ALIGNMENT  (4*sizeof(real))
  #endif
  
  /* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
@@ -221,7 +223,7 @@ typedef struct {
      int      n;
      int      nalloc;
      rvec    *x;
-    real    *q;
+    real    *coefficient;
      rvec    *f;
      gmx_bool bSpread;       /* These coordinates are used for spreading */
      int      pme_order;
@@ -230,7 +232,7 @@ typedef struct {
                                   * the lower cell boundary
                                   */
      int             nthread;
-    int            *thread_idx; /* Which thread should spread which charge */
+    int            *thread_idx; /* Which thread should spread which coefficient */
      thread_plist_t *thread_plist;
      splinedata_t   *spline;
  } pme_atomcomm_t;
@@ -260,9 +262,9 @@ typedef struct {
  typedef struct {
  #ifdef PME_SIMD4_SPREAD_GATHER
      /* Masks for 4-wide SIMD aligned spreading and gathering */
-    gmx_simd4_pb mask_S0[6], mask_S1[6];
+    gmx_simd4_bool_t mask_S0[6], mask_S1[6];
  #else
-    int          dummy; /* C89 requires that struct has at least one member */
+    int              dummy; /* C89 requires that struct has at least one member */
  #endif
  } pme_spline_work_t;
  
@@ -313,6 +315,8 @@ typedef struct gmx_pme {
      int        pme_order;
      real       epsilon_r;
  
+    int        ljpme_combination_rule;  /* Type of combination rule in LJ-PME */
+
      int        ngrids;                  /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups*/
  
      pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
@@ -324,7 +328,7 @@ typedef struct gmx_pme {
                                           * This can probably be done in a better way
                                           * but this simple hack works for now
                                           */
-    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
+    /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
      int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
      /* pmegrid_nz might be larger than strictly necessary to ensure
       * memory alignment, pmegrid_nz_base gives the real base size.
@@ -354,7 +358,7 @@ typedef struct gmx_pme {
      splinevec              bsp_mod;
      /* Buffers to store data for local atoms for L-B combination rule
       * calculations in LJ-PME. lb_buf1 stores either the coefficients
-     * for spreading/gathering (in serial), or the C6 parameters for
+     * for spreading/gathering (in serial), or the C6 coefficient for
       * local atoms (in parallel).  lb_buf2 is only used in parallel,
       * and stores the sigma values for local atoms. */
      real                 *lb_buf1, *lb_buf2;
@@ -371,24 +375,13 @@ typedef struct gmx_pme {
      /* thread local work data for solve_pme */
      pme_work_t *work;
  
-    /* Work data for PME_redist */
-    gmx_bool redist_init;
-    int *    scounts;
-    int *    rcounts;
-    int *    sdispls;
-    int *    rdispls;
-    int *    sidx;
-    int *    idxa;
-    real *   redist_buf;
-    int      redist_buf_nalloc;
-
      /* Work data for sum_qgrid */
      real *   sum_qgrid_tmp;
      real *   sum_qgrid_dd_tmp;
  } t_gmx_pme;
  
  static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
-                                   int start, int end, int thread)
+                                   int start, int grid_index, int end, int thread)
  {
      int             i;
      int            *idxptr, tix, tiy, tiz;
@@ -418,9 +411,9 @@ static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
      rzy = pme->recipbox[ZZ][YY];
      rzz = pme->recipbox[ZZ][ZZ];
  
-    g2tx = pme->pmegrid[PME_GRID_QA].g2t[XX];
-    g2ty = pme->pmegrid[PME_GRID_QA].g2t[YY];
-    g2tz = pme->pmegrid[PME_GRID_QA].g2t[ZZ];
+    g2tx = pme->pmegrid[grid_index].g2t[XX];
+    g2ty = pme->pmegrid[grid_index].g2t[YY];
+    g2tz = pme->pmegrid[grid_index].g2t[ZZ];
  
      bThreads = (atc->nthread > 1);
      if (bThreads)
@@ -671,7 +664,7 @@ static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
          if (atc->nslab > 1)
          {
              srenew(atc->x, atc->nalloc);
-            srenew(atc->q, atc->nalloc);
+            srenew(atc->coefficient, atc->nalloc);
              srenew(atc->f, atc->nalloc);
              for (i = nalloc_old; i < atc->nalloc; i++)
              {
@@ -696,117 +689,6 @@ static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
      }
  }
  
-static void pmeredist_pd(gmx_pme_t pme, gmx_bool gmx_unused forw,
-                         int n, gmx_bool gmx_unused bXF, rvec gmx_unused *x_f,
-                         real gmx_unused *charge, pme_atomcomm_t *atc)
-/* Redistribute particle data for PME calculation */
-/* domain decomposition by x coordinate           */
-{
-    int *idxa;
-    int  i, ii;
-
-    if (FALSE == pme->redist_init)
-    {
-        snew(pme->scounts, atc->nslab);
-        snew(pme->rcounts, atc->nslab);
-        snew(pme->sdispls, atc->nslab);
-        snew(pme->rdispls, atc->nslab);
-        snew(pme->sidx, atc->nslab);
-        pme->redist_init = TRUE;
-    }
-    if (n > pme->redist_buf_nalloc)
-    {
-        pme->redist_buf_nalloc = over_alloc_dd(n);
-        srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
-    }
-
-    pme->idxa = atc->pd;
-
-#ifdef GMX_MPI
-    if (forw && bXF)
-    {
-        /* forward, redistribution from pp to pme */
-
-        /* Calculate send counts and exchange them with other nodes */
-        for (i = 0; (i < atc->nslab); i++)
-        {
-            pme->scounts[i] = 0;
-        }
-        for (i = 0; (i < n); i++)
-        {
-            pme->scounts[pme->idxa[i]]++;
-        }
-        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
-
-        /* Calculate send and receive displacements and index into send
-           buffer */
-        pme->sdispls[0] = 0;
-        pme->rdispls[0] = 0;
-        pme->sidx[0]    = 0;
-        for (i = 1; i < atc->nslab; i++)
-        {
-            pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
-            pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
-            pme->sidx[i]    = pme->sdispls[i];
-        }
-        /* Total # of particles to be received */
-        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
-
-        pme_realloc_atomcomm_things(atc);
-
-        /* Copy particle coordinates into send buffer and exchange*/
-        for (i = 0; (i < n); i++)
-        {
-            ii = DIM*pme->sidx[pme->idxa[i]];
-            pme->sidx[pme->idxa[i]]++;
-            pme->redist_buf[ii+XX] = x_f[i][XX];
-            pme->redist_buf[ii+YY] = x_f[i][YY];
-            pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
-        }
-        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
-                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
-                      pme->rvec_mpi, atc->mpi_comm);
-    }
-    if (forw)
-    {
-        /* Copy charge into send buffer and exchange*/
-        for (i = 0; i < atc->nslab; i++)
-        {
-            pme->sidx[i] = pme->sdispls[i];
-        }
-        for (i = 0; (i < n); i++)
-        {
-            ii = pme->sidx[pme->idxa[i]];
-            pme->sidx[pme->idxa[i]]++;
-            pme->redist_buf[ii] = charge[i];
-        }
-        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
-                      atc->q, pme->rcounts, pme->rdispls, mpi_type,
-                      atc->mpi_comm);
-    }
-    else   /* backward, redistribution from pme to pp */
-    {
-        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
-                      pme->redist_buf, pme->scounts, pme->sdispls,
-                      pme->rvec_mpi, atc->mpi_comm);
-
-        /* Copy data from receive buffer */
-        for (i = 0; i < atc->nslab; i++)
-        {
-            pme->sidx[i] = pme->sdispls[i];
-        }
-        for (i = 0; (i < n); i++)
-        {
-            ii          = DIM*pme->sidx[pme->idxa[i]];
-            x_f[i][XX] += pme->redist_buf[ii+XX];
-            x_f[i][YY] += pme->redist_buf[ii+YY];
-            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
-            pme->sidx[pme->idxa[i]]++;
-        }
-    }
-#endif
-}
-
  static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
                              gmx_bool gmx_unused bBackward, int gmx_unused shift,
                              void gmx_unused *buf_s, int gmx_unused nbyte_s,
@@ -850,9 +732,9 @@ static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
  #endif
  }
  
-static void dd_pmeredist_x_q(gmx_pme_t pme,
-                             int n, gmx_bool bX, rvec *x, real *charge,
-                             pme_atomcomm_t *atc)
+static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
+                                    int n, gmx_bool bX, rvec *x, real *data,
+                                    pme_atomcomm_t *atc)
  {
      int *commnode, *buf_index;
      int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
@@ -872,7 +754,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
      {
          if (atc->count[atc->nodeid] + nsend != n)
          {
-            gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+            gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
                        "This usually means that your system is not well equilibrated.",
                        n - (atc->count[atc->nodeid] + nsend),
                        pme->nodeid, 'x'+atc->dimind);
@@ -892,7 +774,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
              /* Communicate the count */
              if (debug)
              {
-                fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+                fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
                          atc->dimind, atc->nodeid, commnode[i], scount);
              }
              pme_dd_sendrecv(atc, FALSE, i,
@@ -915,7 +797,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
              {
                  copy_rvec(x[i], atc->x[local_pos]);
              }
-            atc->q[local_pos] = charge[i];
+            atc->coefficient[local_pos] = data[i];
              local_pos++;
          }
          else
@@ -925,7 +807,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
              {
                  copy_rvec(x[i], pme->bufv[buf_index[node]]);
              }
-            pme->bufr[buf_index[node]] = charge[i];
+            pme->bufr[buf_index[node]] = data[i];
              buf_index[node]++;
          }
      }
@@ -944,10 +826,10 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
                                  pme->bufv[buf_pos], scount*sizeof(rvec),
                                  atc->x[local_pos], rcount*sizeof(rvec));
              }
-            /* Communicate the charges */
+            /* Communicate the coefficients */
              pme_dd_sendrecv(atc, FALSE, i,
                              pme->bufr+buf_pos, scount*sizeof(real),
-                            atc->q+local_pos, rcount*sizeof(real));
+                            atc->coefficient+local_pos, rcount*sizeof(real));
              buf_pos   += scount;
              local_pos += atc->rcount[i];
          }
@@ -1045,7 +927,7 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
          /* Since we have already (un)wrapped the overlap in the z-dimension,
           * we only have to communicate 0 to nkz (not pmegrid_nz).
           */
-        if (direction == GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_GRID_FORWARD)
          {
              send_id       = overlap->send_id[ipulse];
              recv_id       = overlap->recv_id[ipulse];
@@ -1067,7 +949,7 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
          /* Copy data to contiguous send buffer */
          if (debug)
          {
-            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                      pme->nodeid, overlap->nodeid, send_id,
                      pme->pmegrid_start_iy,
                      send_index0-pme->pmegrid_start_iy,
@@ -1099,7 +981,7 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
          /* Get data from contiguous recv buffer */
          if (debug)
          {
-            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                      pme->nodeid, overlap->nodeid, recv_id,
                      pme->pmegrid_start_iy,
                      recv_index0-pme->pmegrid_start_iy,
@@ -1115,7 +997,7 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
                  for (k = 0; k < pme->nkz; k++)
                  {
                      iz = k;
-                    if (direction == GMX_SUM_QGRID_FORWARD)
+                    if (direction == GMX_SUM_GRID_FORWARD)
                      {
                          grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                      }
@@ -1137,7 +1019,7 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
  
      for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
      {
-        if (direction == GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_GRID_FORWARD)
          {
              send_id       = overlap->send_id[ipulse];
              recv_id       = overlap->recv_id[ipulse];
@@ -1163,12 +1045,12 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
  
          if (debug)
          {
-            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                      pme->nodeid, overlap->nodeid, send_id,
                      pme->pmegrid_start_ix,
                      send_index0-pme->pmegrid_start_ix,
                      send_index0-pme->pmegrid_start_ix+send_nindex);
-            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                      pme->nodeid, overlap->nodeid, recv_id,
                      pme->pmegrid_start_ix,
                      recv_index0-pme->pmegrid_start_ix,
@@ -1182,7 +1064,7 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
                       overlap->mpi_comm, &stat);
  
          /* ADD data from contiguous recv buffer */
-        if (direction == GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_GRID_FORWARD)
          {
              p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
              for (i = 0; i < recv_nindex*datasize; i++)
@@ -1218,13 +1100,12 @@ static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid,
      {
  #ifdef DEBUG_PME
          FILE *fp, *fp2;
-        char  fn[STRLEN], format[STRLEN];
+        char  fn[STRLEN];
          real  val;
          sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
-        fp = ffopen(fn, "w");
+        fp = gmx_ffopen(fn, "w");
          sprintf(fn, "pmegrid%d.txt", pme->nodeid);
-        fp2 = ffopen(fn, "w");
-        sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
+        fp2 = gmx_ffopen(fn, "w");
  #endif
  
          for (ix = 0; ix < local_fft_ndata[XX]; ix++)
@@ -1240,8 +1121,8 @@ static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid,
                      val = 100*pmegrid[pmeidx];
                      if (pmegrid[pmeidx] != 0)
                      {
-                        fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
-                                5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+                        gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
+                                                 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
                      }
                      if (pmegrid[pmeidx] != 0)
                      {
@@ -1257,8 +1138,8 @@ static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid,
              }
          }
  #ifdef DEBUG_PME
-        ffclose(fp);
-        ffclose(fp2);
+        gmx_ffclose(fp);
+        gmx_ffclose(fp2);
  #endif
      }
      return 0;
@@ -1470,7 +1351,7 @@ static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
      for (ithx = 0; (ithx < order); ithx++)                    \
      {                                                    \
          index_x = (i0+ithx)*pny*pnz;                     \
-        valx    = qn*thx[ithx];                          \
+        valx    = coefficient*thx[ithx];                          \
                                                       \
          for (ithy = 0; (ithy < order); ithy++)                \
          {                                                \
@@ -1486,28 +1367,28 @@ static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
      }
  
  
-static void spread_q_bsplines_thread(pmegrid_t                    *pmegrid,
-                                     pme_atomcomm_t               *atc,
-                                     splinedata_t                 *spline,
-                                     pme_spline_work_t gmx_unused *work)
+static void spread_coefficients_bsplines_thread(pmegrid_t                    *pmegrid,
+                                                pme_atomcomm_t               *atc,
+                                                splinedata_t                 *spline,
+                                                pme_spline_work_t gmx_unused *work)
  {
  
-    /* spread charges from home atoms to local grid */
+    /* spread coefficients from home atoms to local grid */
      real          *grid;
      pme_overlap_t *ol;
      int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
      int       *    idxptr;
      int            order, norder, index_x, index_xy, index_xyz;
-    real           valx, valxy, qn;
+    real           valx, valxy, coefficient;
      real          *thx, *thy, *thz;
      int            localsize, bndsize;
      int            pnx, pny, pnz, ndatatot;
      int            offx, offy, offz;
  
  #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
-    real           thz_buffer[12], *thz_aligned;
+    real           thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
  
-    thz_aligned = gmx_simd4_align_real(thz_buffer);
+    thz_aligned = gmx_simd4_align_r(thz_buffer);
  #endif
  
      pnx = pmegrid->s[XX];
@@ -1529,10 +1410,10 @@ static void spread_q_bsplines_thread(pmegrid_t                    *pmegrid,
  
      for (nn = 0; nn < spline->n; nn++)
      {
-        n  = spline->ind[nn];
-        qn = atc->q[n];
+        n           = spline->ind[nn];
+        coefficient = atc->coefficient[n];
  
-        if (qn != 0)
+        if (coefficient != 0)
          {
              idxptr = atc->idx[n];
              norder = nn*order;
@@ -1555,7 +1436,7 @@ static void spread_q_bsplines_thread(pmegrid_t                    *pmegrid,
  #define PME_SPREAD_SIMD4_ALIGNED
  #define PME_ORDER 4
  #endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
  #else
                      DO_BSPLINE(4);
  #endif
@@ -1564,7 +1445,7 @@ static void spread_q_bsplines_thread(pmegrid_t                    *pmegrid,
  #ifdef PME_SIMD4_SPREAD_GATHER
  #define PME_SPREAD_SIMD4_ALIGNED
  #define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
  #else
                      DO_BSPLINE(5);
  #endif
@@ -1877,7 +1758,7 @@ static void realloc_work(pme_work_t *work, int nkx)
           * elements at the end for padding.
           */
  #ifdef PME_SIMD_SOLVE
-        simd_width = GMX_SIMD_WIDTH_HERE;
+        simd_width = GMX_SIMD_REAL_WIDTH;
  #else
          /* We can use any alignment, apart from 0, so we use 4 */
          simd_width = 4;
@@ -1909,34 +1790,34 @@ static void free_work(pme_work_t *work)
  }
  
  
-#if defined PME_SIMD_SOLVE && defined GMX_SIMD_HAVE_EXP
+#if defined PME_SIMD_SOLVE
  /* Calculate exponentials through SIMD */
-inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
  {
      {
-        const gmx_mm_pr two = gmx_set1_pr(2.0);
-        gmx_mm_pr f_simd;
-        gmx_mm_pr lu;
-        gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
+        const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
+        gmx_simd_real_t f_simd;
+        gmx_simd_real_t lu;
+        gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
          int kx;
-        f_simd = gmx_set1_pr(f);
+        f_simd = gmx_simd_set1_r(f);
          /* We only need to calculate from start. But since start is 0 or 1
           * and we want to use aligned loads/stores, we always start from 0.
           */
-        for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+        for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
          {
-            tmp_d1   = gmx_load_pr(d_aligned+kx);
-            d_inv    = gmx_inv_pr(tmp_d1);
-            tmp_r    = gmx_load_pr(r_aligned+kx);
-            tmp_r    = gmx_exp_pr(tmp_r);
-            tmp_e    = gmx_mul_pr(f_simd, d_inv);
-            tmp_e    = gmx_mul_pr(tmp_e, tmp_r);
-            gmx_store_pr(e_aligned+kx, tmp_e);
+            tmp_d1   = gmx_simd_load_r(d_aligned+kx);
+            d_inv    = gmx_simd_inv_r(tmp_d1);
+            tmp_r    = gmx_simd_load_r(r_aligned+kx);
+            tmp_r    = gmx_simd_exp_r(tmp_r);
+            tmp_e    = gmx_simd_mul_r(f_simd, d_inv);
+            tmp_e    = gmx_simd_mul_r(tmp_e, tmp_r);
+            gmx_simd_store_r(e_aligned+kx, tmp_e);
          }
      }
  }
  #else
-inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
+gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
  {
      int kx;
      for (kx = start; kx < end; kx++)
@@ -1954,31 +1835,31 @@ inline static void calc_exponentials_q(int start, int end, real f, real *d, real
  }
  #endif
  
-#if defined PME_SIMD_SOLVE && defined GMX_SIMD_HAVE_ERFC
+#if defined PME_SIMD_SOLVE
  /* Calculate exponentials through SIMD */
-inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
+gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
  {
-    gmx_mm_pr tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
-    const gmx_mm_pr sqr_PI = gmx_sqrt_pr(gmx_set1_pr(M_PI));
+    gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
+    const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
      int kx;
-    for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+    for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
      {
          /* We only need to calculate from start. But since start is 0 or 1
           * and we want to use aligned loads/stores, we always start from 0.
           */
-        tmp_d = gmx_load_pr(d_aligned+kx);
-        d_inv = gmx_inv_pr(tmp_d);
-        gmx_store_pr(d_aligned+kx, d_inv);
-        tmp_r = gmx_load_pr(r_aligned+kx);
-        tmp_r = gmx_exp_pr(tmp_r);
-        gmx_store_pr(r_aligned+kx, tmp_r);
-        tmp_mk  = gmx_load_pr(factor_aligned+kx);
-        tmp_fac = gmx_mul_pr(sqr_PI, gmx_mul_pr(tmp_mk, gmx_erfc_pr(tmp_mk)));
-        gmx_store_pr(factor_aligned+kx, tmp_fac);
+        tmp_d = gmx_simd_load_r(d_aligned+kx);
+        d_inv = gmx_simd_inv_r(tmp_d);
+        gmx_simd_store_r(d_aligned+kx, d_inv);
+        tmp_r = gmx_simd_load_r(r_aligned+kx);
+        tmp_r = gmx_simd_exp_r(tmp_r);
+        gmx_simd_store_r(r_aligned+kx, tmp_r);
+        tmp_mk  = gmx_simd_load_r(factor_aligned+kx);
+        tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
+        gmx_simd_store_r(factor_aligned+kx, tmp_fac);
      }
  }
  #else
-inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
+gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
  {
      int kx;
      real mk;
@@ -2554,6 +2435,7 @@ static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
          work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
          work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
          work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
+
          /* This energy should be corrected for a charged system */
          work->energy_lj = 0.5*energy;
      }
@@ -2635,7 +2517,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
      int     index_x, index_xy;
      int     nx, ny, nz, pnx, pny, pnz;
      int *   idxptr;
-    real    tx, ty, dx, dy, qn;
+    real    tx, ty, dx, dy, coefficient;
      real    fx, fy, fz, gval;
      real    fxy1, fz1;
      real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
@@ -2646,11 +2528,11 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
      pme_spline_work_t *work;
  
  #if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
-    real           thz_buffer[12],  *thz_aligned;
-    real           dthz_buffer[12], *dthz_aligned;
+    real           thz_buffer[GMX_SIMD4_WIDTH*3],  *thz_aligned;
+    real           dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
  
-    thz_aligned  = gmx_simd4_align_real(thz_buffer);
-    dthz_aligned = gmx_simd4_align_real(dthz_buffer);
+    thz_aligned  = gmx_simd4_align_r(thz_buffer);
+    dthz_aligned = gmx_simd4_align_r(dthz_buffer);
  #endif
  
      work = pme->spline_work;
@@ -2678,8 +2560,8 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
  
      for (nn = 0; nn < spline->n; nn++)
      {
-        n  = spline->ind[nn];
-        qn = scale*atc->q[n];
+        n           = spline->ind[nn];
+        coefficient = scale*atc->coefficient[n];
  
          if (bClearF)
          {
@@ -2687,7 +2569,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
              atc->f[n][YY] = 0;
              atc->f[n][ZZ] = 0;
          }
-        if (qn != 0)
+        if (coefficient != 0)
          {
              fx     = 0;
              fy     = 0;
@@ -2717,7 +2599,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
  #define PME_GATHER_F_SIMD4_ALIGNED
  #define PME_ORDER 4
  #endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
  #else
                      DO_FSPLINE(4);
  #endif
@@ -2726,7 +2608,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
  #ifdef PME_SIMD4_SPREAD_GATHER
  #define PME_GATHER_F_SIMD4_ALIGNED
  #define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
  #else
                      DO_FSPLINE(5);
  #endif
@@ -2736,9 +2618,9 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
                      break;
              }
  
-            atc->f[n][XX] += -qn*( fx*nx*rxx );
-            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
-            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
+            atc->f[n][XX] += -coefficient*( fx*nx*rxx );
+            atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
+            atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
          }
      }
      /* Since the energy and not forces are interpolated
@@ -2760,7 +2642,7 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
      int     n, ithx, ithy, ithz, i0, j0, k0;
      int     index_x, index_xy;
      int *   idxptr;
-    real    energy, pot, tx, ty, qn, gval;
+    real    energy, pot, tx, ty, coefficient, gval;
      real    *thx, *thy, *thz;
      int     norder;
      int     order;
@@ -2772,9 +2654,9 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
      energy = 0;
      for (n = 0; (n < atc->n); n++)
      {
-        qn      = atc->q[n];
+        coefficient      = atc->coefficient[n];
  
-        if (qn != 0)
+        if (coefficient != 0)
          {
              idxptr = atc->idx[n];
              norder = n*order;
@@ -2808,7 +2690,7 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
                  }
              }
  
-            energy += pot*qn;
+            energy += pot*coefficient;
          }
      }
  
@@ -2870,7 +2752,7 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
      }
  
  void make_bsplines(splinevec theta, splinevec dtheta, int order,
-                   rvec fractx[], int nr, int ind[], real charge[],
+                   rvec fractx[], int nr, int ind[], real coefficient[],
                     gmx_bool bDoSplines)
  {
      /* construct splines for local atoms */
@@ -2879,12 +2761,12 @@ void make_bsplines(splinevec theta, splinevec dtheta, int order,
  
      for (i = 0; i < nr; i++)
      {
-        /* With free energy we do not use the charge check.
+        /* With free energy we do not use the coefficient check.
           * In most cases this will be more efficient than calling make_bsplines
-         * twice, since usually more than half the particles have charges.
+         * twice, since usually more than half the particles have non-zero coefficients.
           */
          ii = ind[i];
-        if (bDoSplines || charge[ii] != 0.0)
+        if (bDoSplines || coefficient[ii] != 0.0)
          {
              xptr = fractx[ii];
              switch (order)
@@ -3176,7 +3058,6 @@ static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
  
      if (atc->nslab > 1)
      {
-        /* These three allocations are not required for particle decomp. */
          snew(atc->node_dest, atc->nslab);
          snew(atc->node_src, atc->nslab);
          setup_coordinate_communication(atc);
@@ -3252,8 +3133,9 @@ init_overlap_comm(pme_overlap_t *  ol,
      {
          /* s2g0 the local interpolation grid start.
           * s2g1 the local interpolation grid end.
-         * Because grid overlap communication only goes forward,
-         * the grid the slabs for fft's should be rounded down.
+         * Since in calc_pidx we divide particles, and not grid lines,
+         * spatially uniform along dimension x or y, we need to round
+         * s2g0 down and s2g1 up.
           */
          ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
          ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
@@ -3399,31 +3281,31 @@ static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
      pme_spline_work_t *work;
  
  #ifdef PME_SIMD4_SPREAD_GATHER
-    real         tmp[12], *tmp_aligned;
-    gmx_simd4_pr zero_S;
-    gmx_simd4_pr real_mask_S0, real_mask_S1;
-    int          of, i;
+    real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
+    gmx_simd4_real_t zero_S;
+    gmx_simd4_real_t real_mask_S0, real_mask_S1;
+    int              of, i;
  
      snew_aligned(work, 1, SIMD4_ALIGNMENT);
  
-    tmp_aligned = gmx_simd4_align_real(tmp);
+    tmp_aligned = gmx_simd4_align_r(tmp);
  
-    zero_S = gmx_simd4_setzero_pr();
+    zero_S = gmx_simd4_setzero_r();
  
      /* Generate bit masks to mask out the unused grid entries,
       * as we only operate on order of the 8 grid entries that are
       * load into 2 SIMD registers.
       */
-    for (of = 0; of < 8-(order-1); of++)
+    for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
      {
-        for (i = 0; i < 8; i++)
+        for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
          {
              tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
          }
-        real_mask_S0      = gmx_simd4_load_pr(tmp_aligned);
-        real_mask_S1      = gmx_simd4_load_pr(tmp_aligned+4);
-        work->mask_S0[of] = gmx_simd4_cmplt_pr(real_mask_S0, zero_S);
-        work->mask_S1[of] = gmx_simd4_cmplt_pr(real_mask_S1, zero_S);
+        real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
+        real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
+        work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
+        work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
      }
  #else
      work = NULL;
@@ -3475,7 +3357,7 @@ void gmx_pme_check_restrictions(int pme_order,
              *bValidSettings = FALSE;
              return;
          }
-        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
+        gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
                    nkx/(double)nnodes_major, pme_order);
      }
  
@@ -3509,11 +3391,9 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
      }
      snew(pme, 1);
  
-    pme->redist_init         = FALSE;
      pme->sum_qgrid_tmp       = NULL;
      pme->sum_qgrid_dd_tmp    = NULL;
      pme->buf_nalloc          = 0;
-    pme->redist_buf_nalloc   = 0;
  
      pme->nnodes              = 1;
      pme->bPPnode             = TRUE;
@@ -3530,7 +3410,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
          MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
          if (pme->nnodes != nnodes_major*nnodes_minor)
          {
-            gmx_incons("PME node count mismatch");
+            gmx_incons("PME rank count mismatch");
          }
      }
      else
@@ -3579,7 +3459,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
          {
              if (pme->nnodes % nnodes_major != 0)
              {
-                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
+                gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
              }
              pme->ndecompdim = 2;
  
@@ -3628,8 +3508,13 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
      pme->nkz         = ir->nkz;
      pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
      pme->pme_order   = ir->pme_order;
+
+    /* Always constant electrostatics coefficients */
      pme->epsilon_r   = ir->epsilon_r;
  
+    /* Always constant LJ coefficients */
+    pme->ljpme_combination_rule = ir->ljpme_combination_rule;
+
      /* If we violate restrictions, generate a fatal error here */
      gmx_pme_check_restrictions(pme->pme_order,
                                 pme->nkx, pme->nky, pme->nkz,
@@ -3648,10 +3533,10 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
          MPI_Type_commit(&(pme->rvec_mpi));
  #endif
  
-        /* Note that the charge spreading and force gathering, which usually
+        /* Note that the coefficient spreading and force gathering, which usually
           * takes about the same amount of time as FFT+solve_pme,
           * is always fully load balanced
-         * (unless the charge distribution is inhomogeneous).
+         * (unless the coefficient distribution is inhomogeneous).
           */
  
          imbal = pme_load_imbalance(pme);
@@ -3661,8 +3546,8 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                      "\n"
                      "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
                      "      For optimal PME load balancing\n"
-                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
-                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
+                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
                      "\n",
                      (int)((imbal-1)*100 + 0.5),
                      pme->nkx, pme->nky, pme->nnodes_major,
@@ -3740,14 +3625,28 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
      ndata[0]    = pme->nkx;
      ndata[1]    = pme->nky;
      ndata[2]    = pme->nkz;
-    pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
+    /* It doesn't matter if we allocate too many grids here,
+     * we only allocate and use the ones we need.
+     */
+    if (EVDW_PME(ir->vdwtype))
+    {
+        pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
+    }
+    else
+    {
+        pme->ngrids = DO_Q;
+    }
      snew(pme->fftgrid, pme->ngrids);
      snew(pme->cfftgrid, pme->ngrids);
      snew(pme->pfft_setup, pme->ngrids);
  
      for (i = 0; i < pme->ngrids; ++i)
      {
-        if (((ir->ljpme_combination_rule == eljpmeLB) && i >= 2) || i % 2 == 0 || bFreeEnergy_q || bFreeEnergy_lj)
+        if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
+                                                       bFreeEnergy_q)) ||
+            (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
+                                                    bFreeEnergy_lj ||
+                                                    ir->ljpme_combination_rule == eljpmeLB)))
          {
              pmegrids_init(&pme->pmegrid[i],
                            pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
@@ -3874,8 +3773,8 @@ int gmx_pme_reinit(gmx_pme_t *         pmedata,
  }
  
  
-static void copy_local_grid(gmx_pme_t pme,
-                            pmegrids_t *pmegrids, int thread, real *fftgrid)
+static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
+                            int grid_index, int thread, real *fftgrid)
  {
      ivec local_fft_ndata, local_fft_offset, local_fft_size;
      int  fft_my, fft_mz;
@@ -3886,7 +3785,7 @@ static void copy_local_grid(gmx_pme_t pme,
      pmegrid_t *pmegrid;
      real *grid_th;
  
-    gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                     local_fft_ndata,
                                     local_fft_offset,
                                     local_fft_size);
@@ -3930,14 +3829,15 @@ static void copy_local_grid(gmx_pme_t pme,
  static void
  reduce_threadgrid_overlap(gmx_pme_t pme,
                            const pmegrids_t *pmegrids, int thread,
-                          real *fftgrid, real *commbuf_x, real *commbuf_y)
+                          real *fftgrid, real *commbuf_x, real *commbuf_y,
+                          int grid_index)
  {
      ivec local_fft_ndata, local_fft_offset, local_fft_size;
      int  fft_nx, fft_ny, fft_nz;
      int  fft_my, fft_mz;
      int  buf_my = -1;
      int  nsx, nsy, nsz;
-    ivec ne;
+    ivec localcopy_end;
      int  offx, offy, offz, x, y, z, i0, i0t;
      int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
      gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
@@ -3948,7 +3848,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
      const real *grid_th;
      real *commbuf = NULL;
  
-    gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                     local_fft_ndata,
                                     local_fft_offset,
                                     local_fft_size);
@@ -3973,8 +3873,14 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
  
      for (d = 0; d < DIM; d++)
      {
-        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
-                    local_fft_ndata[d]);
+        /* Determine up to where our thread needs to copy from the
+         * thread-local charge spreading grid to the rank-local FFT grid.
+         * This is up to our spreading grid end minus order-1 and
+         * not beyond the local FFT grid.
+         */
+        localcopy_end[d] =
+            min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+                local_fft_ndata[d]);
      }
  
      offx = pmegrid->offset[XX];
@@ -3989,7 +3895,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
      /* Now loop over all the thread data blocks that contribute
       * to the grid region we (our thread) are operating on.
       */
-    /* Note that ffy_nx/y is equal to the number of grid points
+    /* Note that fft_nx/y is equal to the number of grid points
       * between the first point of our node grid and the one of the next node.
       */
      for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
@@ -4005,12 +3911,15 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
          }
          pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
          ox       += pmegrid_g->offset[XX];
+        /* Determine the end of our part of the source grid */
          if (!bCommX)
          {
-            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+            /* Use our thread local source grid and target grid part */
+            tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
          }
          else
          {
+            /* Use our thread local source grid and the spreading range */
              tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
          }
  
@@ -4027,12 +3936,15 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
              }
              pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
              oy       += pmegrid_g->offset[YY];
+            /* Determine the end of our part of the source grid */
              if (!bCommY)
              {
-                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+                /* Use our thread local source grid and target grid part */
+                ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
              }
              else
              {
+                /* Use our thread local source grid and the spreading range */
                  ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
              }
  
@@ -4047,7 +3959,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                  }
                  pmegrid_g = &pmegrids->grid_th[fz];
                  oz       += pmegrid_g->offset[ZZ];
-                tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+                tz1       = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
  
                  if (sx == 0 && sy == 0 && sz == 0)
                  {
@@ -4103,7 +4015,13 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                      if (bCommY)
                      {
                          commbuf = commbuf_y;
-                        buf_my  = ty1 - offy;
+                        /* The y-size of the communication buffer is set by
+                         * the overlap of the grid part of our local slab
+                         * with the part starting at the next slab.
+                         */
+                        buf_my  =
+                            pme->overlap[1].s2g1[pme->nodeid_minor] -
+                            pme->overlap[1].s2g0[pme->nodeid_minor+1];
                          if (bCommX)
                          {
                              /* We index commbuf modulo the local grid size */
@@ -4158,7 +4076,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
  }
  
  
-static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
+static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
  {
      ivec local_fft_ndata, local_fft_offset, local_fft_size;
      pme_overlap_t *overlap;
@@ -4173,13 +4091,13 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
      int  x, y, z, indg, indb;
  
      /* Note that this routine is only used for forward communication.
-     * Since the force gathering, unlike the charge spreading,
+     * Since the force gathering, unlike the coefficient spreading,
       * can be trivially parallelized over the particles,
       * the backwards process is much simpler and can use the "old"
       * communication setup.
       */
  
-    gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                     local_fft_ndata,
                                     local_fft_offset,
                                     local_fft_size);
@@ -4216,6 +4134,12 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
              sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
              recvptr = overlap->recvbuf;
  
+            if (debug != NULL)
+            {
+                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
+                        local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
+            }
+
  #ifdef GMX_MPI
              MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
                           send_id, ipulse,
@@ -4282,7 +4206,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
  
          if (debug != NULL)
          {
-            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+            fprintf(debug, "PME fftgrid comm x %2d x %2d x %2d\n",
                      send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
          }
  
@@ -4313,7 +4237,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
  static void spread_on_grid(gmx_pme_t pme,
                             pme_atomcomm_t *atc, pmegrids_t *grids,
                             gmx_bool bCalcSplines, gmx_bool bSpread,
-                           real *fftgrid, gmx_bool bDoSplines )
+                           real *fftgrid, gmx_bool bDoSplines, int grid_index)
  {
      int nthread, thread;
  #ifdef PME_TIME_THREADS
@@ -4342,7 +4266,7 @@ static void spread_on_grid(gmx_pme_t pme,
              /* Compute fftgrid index for all atoms,
               * with help of some extra variables.
               */
-            calc_interpolation_idx(pme, atc, start, end, thread);
+            calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
          }
      }
  #ifdef PME_TIME_THREADS
@@ -4377,7 +4301,7 @@ static void spread_on_grid(gmx_pme_t pme,
  
              if (grids->nthread == 1)
              {
-                /* One thread, we operate on all charges */
+                /* One thread, we operate on all coefficients */
                  spline->n = atc->n;
              }
              else
@@ -4392,7 +4316,7 @@ static void spread_on_grid(gmx_pme_t pme,
          if (bCalcSplines)
          {
              make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
-                          atc->fractx, spline->n, spline->ind, atc->q, bDoSplines);
+                          atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
          }
  
          if (bSpread)
@@ -4401,11 +4325,11 @@ static void spread_on_grid(gmx_pme_t pme,
  #ifdef PME_TIME_SPREAD
              ct1a = omp_cyc_start();
  #endif
-            spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
+            spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
  
              if (pme->bUseThreads)
              {
-                copy_local_grid(pme, grids, thread, fftgrid);
+                copy_local_grid(pme, grids, grid_index, thread, fftgrid);
              }
  #ifdef PME_TIME_SPREAD
              ct1a          = omp_cyc_end(ct1a);
@@ -4429,7 +4353,8 @@ static void spread_on_grid(gmx_pme_t pme,
              reduce_threadgrid_overlap(pme, grids, thread,
                                        fftgrid,
                                        pme->overlap[0].sendbuf,
-                                      pme->overlap[1].sendbuf);
+                                      pme->overlap[1].sendbuf,
+                                      grid_index);
          }
  #ifdef PME_TIME_THREADS
          c3   = omp_cyc_end(c3);
@@ -4442,7 +4367,7 @@ static void spread_on_grid(gmx_pme_t pme,
               * For this communication call we need to check pme->bUseThreads
               * to have all ranks communicate here, regardless of pme->nthread.
               */
-            sum_fftgrid_dd(pme, fftgrid);
+            sum_fftgrid_dd(pme, fftgrid, grid_index);
          }
      }
  
@@ -4514,7 +4439,7 @@ void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
      {
          gmx_incons("gmx_pme_calc_energy called in parallel");
      }
-    if (pme->bFEP > 1)
+    if (pme->bFEP_q > 1)
      {
          gmx_incons("gmx_pme_calc_energy with free energy");
      }
@@ -4530,14 +4455,14 @@ void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
      atc->pme_order = pme->pme_order;
      atc->n         = n;
      pme_realloc_atomcomm_things(atc);
-    atc->x         = x;
-    atc->q         = q;
+    atc->x           = x;
+    atc->coefficient = q;
  
      /* We only use the A-charges grid */
      grid = &pme->pmegrid[PME_GRID_QA];
  
      /* Only calculate the spline coefficients, don't actually spread */
-    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE);
+    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
  
      *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
  }
@@ -4641,7 +4566,7 @@ int gmx_pmeonly(gmx_pme_t pme,
          do
          {
              /* Domain decomposition */
-            ret = gmx_pme_recv_params_coords(pme_pp,
+            ret = gmx_pme_recv_coeffs_coords(pme_pp,
                                               &natoms,
                                               &chargeA, &chargeB,
                                               &c6A, &c6B,
@@ -4721,10 +4646,10 @@ calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
      {
          real sigma4;
  
-        sigma4           = local_sigma[i];
-        sigma4           = sigma4*sigma4;
-        sigma4           = sigma4*sigma4;
-        pme->atc[0].q[i] = local_c6[i] / sigma4;
+        sigma4                     = local_sigma[i];
+        sigma4                     = sigma4*sigma4;
+        sigma4                     = sigma4*sigma4;
+        pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
      }
  }
  
@@ -4735,13 +4660,13 @@ calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
  
      for (i = 0; i < pme->atc[0].n; ++i)
      {
-        pme->atc[0].q[i] *= local_sigma[i];
+        pme->atc[0].coefficient[i] *= local_sigma[i];
      }
  }
  
  static void
-do_redist_x_q(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
-              gmx_bool bFirst, rvec x[], real *data)
+do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
+                     gmx_bool bFirst, rvec x[], real *data)
  {
      int      d;
      pme_atomcomm_t *atc;
@@ -4751,19 +4676,19 @@ do_redist_x_q(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
      {
          int             n_d;
          rvec           *x_d;
-        real           *q_d;
+        real           *param_d;
  
          if (d == pme->ndecompdim - 1)
          {
-            n_d = homenr;
-            x_d = x + start;
-            q_d = data;
+            n_d     = homenr;
+            x_d     = x + start;
+            param_d = data;
          }
          else
          {
-            n_d = pme->atc[d + 1].n;
-            x_d = atc->x;
-            q_d = atc->q;
+            n_d     = pme->atc[d + 1].n;
+            x_d     = atc->x;
+            param_d = atc->coefficient;
          }
          atc      = &pme->atc[d];
          atc->npd = n_d;
@@ -4777,23 +4702,17 @@ do_redist_x_q(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
          /* Redistribute x (only once) and qA/c6A or qB/c6B */
          if (DOMAINDECOMP(cr))
          {
-            dd_pmeredist_x_q(pme, n_d, bFirst, x_d, q_d, atc);
-        }
-        else
-        {
-            pmeredist_pd(pme, TRUE, n_d, bFirst, x_d, q_d, atc);
+            dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
          }
      }
  }
  
-/* TODO: when adding free-energy support, sigmaB will no longer be
- * unused */
  int gmx_pme_do(gmx_pme_t pme,
                 int start,       int homenr,
                 rvec x[],        rvec f[],
                 real *chargeA,   real *chargeB,
                 real *c6A,       real *c6B,
-               real *sigmaA,    real gmx_unused *sigmaB,
+               real *sigmaA,    real *sigmaB,
                 matrix box, t_commrec *cr,
                 int  maxshift_x, int maxshift_y,
                 t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
@@ -4804,7 +4723,7 @@ int gmx_pme_do(gmx_pme_t pme,
                 real *dvdlambda_q, real *dvdlambda_lj,
                 int flags)
  {
-    int     q, d, i, j, ntot, npme, qmax;
+    int     d, i, j, k, ntot, npme, grid_index, max_grid_index;
      int     nx, ny, nz;
      int     n_d, local_ny;
      pme_atomcomm_t *atc = NULL;
@@ -4812,7 +4731,7 @@ int gmx_pme_do(gmx_pme_t pme,
      real    *grid       = NULL;
      real    *ptr;
      rvec    *x_d, *f_d;
-    real    *charge = NULL, *q_d;
+    real    *coefficient = NULL;
      real    energy_AB[4];
      matrix  vir_AB[4];
      real    scale, lambda;
@@ -4822,6 +4741,8 @@ int gmx_pme_do(gmx_pme_t pme,
      t_complex * cfftgrid;
      int     thread;
      gmx_bool bFirst, bDoSplines;
+    int fep_state;
+    int fep_states_lj           = pme->bFEP_lj ? 2 : 1;
      const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
      const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
  
@@ -4869,49 +4790,49 @@ int gmx_pme_do(gmx_pme_t pme,
      bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
  
      /* We need a maximum of four separate PME calculations:
-     * q=0: Coulomb PME with charges from state A
-     * q=1: Coulomb PME with charges from state B
-     * q=2: LJ PME with C6 from state A
-     * q=3: LJ PME with C6 from state B
+     * grid_index=0: Coulomb PME with charges from state A
+     * grid_index=1: Coulomb PME with charges from state B
+     * grid_index=2: LJ PME with C6 from state A
+     * grid_index=3: LJ PME with C6 from state B
       * For Lorentz-Berthelot combination rules, a separate loop is used to
       * calculate all the terms
       */
  
      /* If we are doing LJ-PME with LB, we only do Q here */
-    qmax = (flags & GMX_PME_LJ_LB) ? DO_Q : DO_Q_AND_LJ;
+    max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
  
-    for (q = 0; q < qmax; ++q)
+    for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
      {
-        /* Check if we should do calculations at this q
-         * If q is odd we should be doing FEP
-         * If q < 2 we should be doing electrostatic PME
-         * If q >= 2 we should be doing LJ-PME
+        /* Check if we should do calculations at this grid_index
+         * If grid_index is odd we should be doing FEP
+         * If grid_index < 2 we should be doing electrostatic PME
+         * If grid_index >= 2 we should be doing LJ-PME
           */
-        if ((!pme->bFEP_q && q == 1)
-            || (!pme->bFEP_lj && q == 3)
-            || (!(flags & GMX_PME_DO_COULOMB) && q < 2)
-            || (!(flags & GMX_PME_DO_LJ) && q >= 2))
+        if ((grid_index <  DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
+                                    (grid_index == 1 && !pme->bFEP_q))) ||
+            (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
+                                    (grid_index == 3 && !pme->bFEP_lj))))
          {
              continue;
          }
          /* Unpack structure */
-        pmegrid    = &pme->pmegrid[q];
-        fftgrid    = pme->fftgrid[q];
-        cfftgrid   = pme->cfftgrid[q];
-        pfft_setup = pme->pfft_setup[q];
-        switch (q)
+        pmegrid    = &pme->pmegrid[grid_index];
+        fftgrid    = pme->fftgrid[grid_index];
+        cfftgrid   = pme->cfftgrid[grid_index];
+        pfft_setup = pme->pfft_setup[grid_index];
+        switch (grid_index)
          {
-            case 0: charge = chargeA + start; break;
-            case 1: charge = chargeB + start; break;
-            case 2: charge = c6A + start; break;
-            case 3: charge = c6B + start; break;
+            case 0: coefficient = chargeA + start; break;
+            case 1: coefficient = chargeB + start; break;
+            case 2: coefficient = c6A + start; break;
+            case 3: coefficient = c6B + start; break;
          }
  
          grid = pmegrid->grid.grid;
  
          if (debug)
          {
-            fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+            fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
                      cr->nnodes, cr->nodeid);
              fprintf(debug, "Grid = %p\n", (void*)grid);
              if (grid == NULL)
@@ -4923,12 +4844,12 @@ int gmx_pme_do(gmx_pme_t pme,
  
          if (pme->nnodes == 1)
          {
-            atc->q = charge;
+            atc->coefficient = coefficient;
          }
          else
          {
              wallcycle_start(wcycle, ewcPME_REDISTXF);
-            do_redist_x_q(pme, cr, start, homenr, bFirst, x, charge);
+            do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
              where();
  
              wallcycle_stop(wcycle, ewcPME_REDISTXF);
@@ -4936,22 +4857,22 @@ int gmx_pme_do(gmx_pme_t pme,
  
          if (debug)
          {
-            fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+            fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
                      cr->nodeid, atc->n);
          }
  
-        if (flags & GMX_PME_SPREAD_Q)
+        if (flags & GMX_PME_SPREAD)
          {
              wallcycle_start(wcycle, ewcPME_SPREADGATHER);
  
-            /* Spread the charges on a grid */
-            spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines);
+            /* Spread the coefficients on a grid */
+            spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
  
              if (bFirst)
              {
                  inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
              }
-            inc_nrnb(nrnb, eNR_SPREADQBSP,
+            inc_nrnb(nrnb, eNR_SPREADBSP,
                       pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
  
              if (!pme->bUseThreads)
@@ -4962,12 +4883,12 @@ int gmx_pme_do(gmx_pme_t pme,
  #ifdef GMX_MPI
                  if (pme->nnodes > 1)
                  {
-                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
+                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
                      where();
                  }
  #endif
  
-                copy_pmegrid_to_fftgrid(pme, grid, fftgrid, q);
+                copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
              }
  
              wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
@@ -5002,9 +4923,9 @@ int gmx_pme_do(gmx_pme_t pme,
                  /* solve in k-space for our local cells */
                  if (thread == 0)
                  {
-                    wallcycle_start(wcycle, (q < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+                    wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
                  }
-                if (q < DO_Q)
+                if (grid_index < DO_Q)
                  {
                      loop_count =
                          solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
@@ -5023,7 +4944,7 @@ int gmx_pme_do(gmx_pme_t pme,
  
                  if (thread == 0)
                  {
-                    wallcycle_stop(wcycle, (q < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+                    wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
                      where();
                      inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                  }
@@ -5052,10 +4973,13 @@ int gmx_pme_do(gmx_pme_t pme,
                          inc_nrnb(nrnb, eNR_FFT, 2*npme);
                      }
  
+                    /* Note: this wallcycle region is closed below
+                       outside an OpenMP region, so take care if
+                       refactoring code here. */
                      wallcycle_start(wcycle, ewcPME_SPREADGATHER);
                  }
  
-                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, q, pme->nthread, thread);
+                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
              }
          }
          /* End of thread parallel section.
@@ -5068,7 +4992,7 @@ int gmx_pme_do(gmx_pme_t pme,
  #ifdef GMX_MPI
              if (pme->nnodes > 1)
              {
-                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
+                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
              }
  #endif
              where();
@@ -5083,20 +5007,22 @@ int gmx_pme_do(gmx_pme_t pme,
               * atc->f is the actual force array, not a buffer,
               * therefore we should not clear it.
               */
-            lambda  = q < DO_Q ? lambda_q : lambda_lj;
+            lambda  = grid_index < DO_Q ? lambda_q : lambda_lj;
              bClearF = (bFirst && PAR(cr));
  #pragma omp parallel for num_threads(pme->nthread) schedule(static)
              for (thread = 0; thread < pme->nthread; thread++)
              {
                  gather_f_bsplines(pme, grid, bClearF, atc,
                                    &atc->spline[thread],
-                                  pme->bFEP ? (q % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
+                                  pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
              }
  
              where();
  
              inc_nrnb(nrnb, eNR_GATHERFBSP,
                       pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+            /* Note: this wallcycle region is opened above inside an OpenMP
+               region, so take care if refactoring code here. */
              wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
          }
  
@@ -5105,244 +5031,269 @@ int gmx_pme_do(gmx_pme_t pme,
              /* This should only be called on the master thread
               * and after the threads have synchronized.
               */
-            if (q < 2)
+            if (grid_index < 2)
              {
-                get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+                get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
              }
              else
              {
-                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
              }
          }
          bFirst = FALSE;
-    } /* of q-loop */
+    } /* of grid_index-loop */
  
      /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
       * seven terms. */
  
-    if (flags & GMX_PME_LJ_LB)
+    if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
      {
-        real *local_c6 = NULL, *local_sigma = NULL;
-        if (pme->nnodes == 1)
-        {
-            if (pme->lb_buf1 == NULL)
-            {
-                pme->lb_buf_nalloc = pme->atc[0].n;
-                snew(pme->lb_buf1, pme->lb_buf_nalloc);
-            }
-            pme->atc[0].q = pme->lb_buf1;
-            local_c6      = c6A;
-            local_sigma   = sigmaA;
-        }
-        else
+        /* Loop over A- and B-state if we are doing FEP */
+        for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
          {
-            atc = &pme->atc[0];
-
-            wallcycle_start(wcycle, ewcPME_REDISTXF);
-
-            do_redist_x_q(pme, cr, start, homenr, bFirst, x, c6A);
-            if (pme->lb_buf_nalloc < atc->n)
+            real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
+            if (pme->nnodes == 1)
              {
-                pme->lb_buf_nalloc = atc->nalloc;
-                srenew(pme->lb_buf1, pme->lb_buf_nalloc);
-                srenew(pme->lb_buf2, pme->lb_buf_nalloc);
-            }
-            local_c6 = pme->lb_buf1;
-            for (i = 0; i < atc->n; ++i)
-            {
-                local_c6[i] = atc->q[i];
-            }
-            where();
-
-            do_redist_x_q(pme, cr, start, homenr, FALSE, x, sigmaA);
-            local_sigma = pme->lb_buf2;
-            for (i = 0; i < atc->n; ++i)
-            {
-                local_sigma[i] = atc->q[i];
+                if (pme->lb_buf1 == NULL)
+                {
+                    pme->lb_buf_nalloc = pme->atc[0].n;
+                    snew(pme->lb_buf1, pme->lb_buf_nalloc);
+                }
+                pme->atc[0].coefficient = pme->lb_buf1;
+                switch (fep_state)
+                {
+                    case 0:
+                        local_c6      = c6A;
+                        local_sigma   = sigmaA;
+                        break;
+                    case 1:
+                        local_c6      = c6B;
+                        local_sigma   = sigmaB;
+                        break;
+                    default:
+                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+                }
              }
-            where();
-
-            wallcycle_stop(wcycle, ewcPME_REDISTXF);
-        }
-        calc_initial_lb_coeffs(pme, local_c6, local_sigma);
-
-        /*Seven terms in LJ-PME with LB, q < 2 reserved for electrostatics*/
-        for (q = 2; q < 9; ++q)
-        {
-            /* Unpack structure */
-            pmegrid    = &pme->pmegrid[q];
-            fftgrid    = pme->fftgrid[q];
-            cfftgrid   = pme->cfftgrid[q];
-            pfft_setup = pme->pfft_setup[q];
-            calc_next_lb_coeffs(pme, local_sigma);
-            grid = pmegrid->grid.grid;
-            where();
-
-            if (flags & GMX_PME_SPREAD_Q)
+            else
              {
-                wallcycle_start(wcycle, ewcPME_SPREADGATHER);
-                /* Spread the charges on a grid */
-                spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines);
+                atc = &pme->atc[0];
+                switch (fep_state)
+                {
+                    case 0:
+                        RedistC6      = c6A;
+                        RedistSigma   = sigmaA;
+                        break;
+                    case 1:
+                        RedistC6      = c6B;
+                        RedistSigma   = sigmaB;
+                        break;
+                    default:
+                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+                }
+                wallcycle_start(wcycle, ewcPME_REDISTXF);
  
-                if (bFirst)
+                do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
+                if (pme->lb_buf_nalloc < atc->n)
                  {
-                    inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
+                    pme->lb_buf_nalloc = atc->nalloc;
+                    srenew(pme->lb_buf1, pme->lb_buf_nalloc);
+                    srenew(pme->lb_buf2, pme->lb_buf_nalloc);
                  }
-                inc_nrnb(nrnb, eNR_SPREADQBSP,
-                         pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
-                if (pme->nthread == 1)
+                local_c6 = pme->lb_buf1;
+                for (i = 0; i < atc->n; ++i)
                  {
-                    wrap_periodic_pmegrid(pme, grid);
+                    local_c6[i] = atc->coefficient[i];
+                }
+                where();
  
-                    /* sum contributions to local grid from other nodes */
-#ifdef GMX_MPI
-                    if (pme->nnodes > 1)
-                    {
-                        gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
-                        where();
-                    }
-#endif
-                    copy_pmegrid_to_fftgrid(pme, grid, fftgrid, q);
+                do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
+                local_sigma = pme->lb_buf2;
+                for (i = 0; i < atc->n; ++i)
+                {
+                    local_sigma[i] = atc->coefficient[i];
                  }
+                where();
  
-                wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+                wallcycle_stop(wcycle, ewcPME_REDISTXF);
              }
+            calc_initial_lb_coeffs(pme, local_c6, local_sigma);
  
-            /*Here we start a large thread parallel region*/
-#pragma omp parallel num_threads(pme->nthread) private(thread)
+            /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
+            for (grid_index = 2; grid_index < 9; ++grid_index)
              {
-                thread = gmx_omp_get_thread_num();
-                if (flags & GMX_PME_SOLVE)
+                /* Unpack structure */
+                pmegrid    = &pme->pmegrid[grid_index];
+                fftgrid    = pme->fftgrid[grid_index];
+                cfftgrid   = pme->cfftgrid[grid_index];
+                pfft_setup = pme->pfft_setup[grid_index];
+                calc_next_lb_coeffs(pme, local_sigma);
+                grid = pmegrid->grid.grid;
+                where();
+
+                if (flags & GMX_PME_SPREAD)
                  {
-                    /* do 3d-fft */
-                    if (thread == 0)
+                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+                    /* Spread the c6 on a grid */
+                    spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
+
+                    if (bFirst)
                      {
-                        wallcycle_start(wcycle, ewcPME_FFT);
+                        inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
                      }
  
-                    gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
-                                               thread, wcycle);
-                    if (thread == 0)
+                    inc_nrnb(nrnb, eNR_SPREADBSP,
+                             pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
+                    if (pme->nthread == 1)
                      {
-                        wallcycle_stop(wcycle, ewcPME_FFT);
+                        wrap_periodic_pmegrid(pme, grid);
+                        /* sum contributions to local grid from other nodes */
+#ifdef GMX_MPI
+                        if (pme->nnodes > 1)
+                        {
+                            gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
+                            where();
+                        }
+#endif
+                        copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
                      }
-                    where();
+                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
                  }
-            }
-            bFirst = FALSE;
-        }
-        if (flags & GMX_PME_SOLVE)
-        {
-            int loop_count;
-            /* solve in k-space for our local cells */
+                /*Here we start a large thread parallel region*/
  #pragma omp parallel num_threads(pme->nthread) private(thread)
-            {
-                thread = gmx_omp_get_thread_num();
-                if (thread == 0)
                  {
-                    wallcycle_start(wcycle, ewcLJPME);
-                }
+                    thread = gmx_omp_get_thread_num();
+                    if (flags & GMX_PME_SOLVE)
+                    {
+                        /* do 3d-fft */
+                        if (thread == 0)
+                        {
+                            wallcycle_start(wcycle, ewcPME_FFT);
+                        }
  
-                loop_count =
-                    solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
-                                     box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
-                                     bCalcEnerVir,
-                                     pme->nthread, thread);
-                if (thread == 0)
-                {
-                    wallcycle_stop(wcycle, ewcLJPME);
-                    where();
-                    inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+                                                   thread, wcycle);
+                        if (thread == 0)
+                        {
+                            wallcycle_stop(wcycle, ewcPME_FFT);
+                        }
+                        where();
+                    }
                  }
+                bFirst = FALSE;
              }
-        }
-
-        if (bCalcEnerVir)
-        {
-            /* This should only be called on the master thread and
-             * after the threads have synchronized.
-             */
-            get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2], vir_AB[2]);
-        }
-
-        if (bCalcF)
-        {
-            bFirst = !(flags & GMX_PME_DO_COULOMB);
-            calc_initial_lb_coeffs(pme, local_c6, local_sigma);
-            for (q = 8; q >= 2; --q)
+            if (flags & GMX_PME_SOLVE)
              {
-                /* Unpack structure */
-                pmegrid    = &pme->pmegrid[q];
-                fftgrid    = pme->fftgrid[q];
-                cfftgrid   = pme->cfftgrid[q];
-                pfft_setup = pme->pfft_setup[q];
-                grid       = pmegrid->grid.grid;
-                calc_next_lb_coeffs(pme, local_sigma);
-                where();
+                /* solve in k-space for our local cells */
  #pragma omp parallel num_threads(pme->nthread) private(thread)
                  {
+                    int loop_count;
                      thread = gmx_omp_get_thread_num();
-                    /* do 3d-invfft */
                      if (thread == 0)
                      {
-                        where();
-                        wallcycle_start(wcycle, ewcPME_FFT);
+                        wallcycle_start(wcycle, ewcLJPME);
                      }
  
-                    gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
-                                               thread, wcycle);
+                    loop_count =
+                        solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
+                                         box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+                                         bCalcEnerVir,
+                                         pme->nthread, thread);
                      if (thread == 0)
                      {
-                        wallcycle_stop(wcycle, ewcPME_FFT);
-
+                        wallcycle_stop(wcycle, ewcLJPME);
                          where();
+                        inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+                    }
+                }
+            }
  
-                        if (pme->nodeid == 0)
+            if (bCalcEnerVir)
+            {
+                /* This should only be called on the master thread and
+                 * after the threads have synchronized.
+                 */
+                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
+            }
+
+            if (bCalcF)
+            {
+                bFirst = !(flags & GMX_PME_DO_COULOMB);
+                calc_initial_lb_coeffs(pme, local_c6, local_sigma);
+                for (grid_index = 8; grid_index >= 2; --grid_index)
+                {
+                    /* Unpack structure */
+                    pmegrid    = &pme->pmegrid[grid_index];
+                    fftgrid    = pme->fftgrid[grid_index];
+                    cfftgrid   = pme->cfftgrid[grid_index];
+                    pfft_setup = pme->pfft_setup[grid_index];
+                    grid       = pmegrid->grid.grid;
+                    calc_next_lb_coeffs(pme, local_sigma);
+                    where();
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+                    {
+                        thread = gmx_omp_get_thread_num();
+                        /* do 3d-invfft */
+                        if (thread == 0)
                          {
-                            ntot  = pme->nkx*pme->nky*pme->nkz;
-                            npme  = ntot*log((real)ntot)/log(2.0);
-                            inc_nrnb(nrnb, eNR_FFT, 2*npme);
+                            where();
+                            wallcycle_start(wcycle, ewcPME_FFT);
                          }
-                        wallcycle_start(wcycle, ewcPME_SPREADGATHER);
-                    }
  
-                    copy_fftgrid_to_pmegrid(pme, fftgrid, grid, q, pme->nthread, thread);
+                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+                                                   thread, wcycle);
+                        if (thread == 0)
+                        {
+                            wallcycle_stop(wcycle, ewcPME_FFT);
+
+                            where();
  
-                } /*#pragma omp parallel*/
+                            if (pme->nodeid == 0)
+                            {
+                                ntot  = pme->nkx*pme->nky*pme->nkz;
+                                npme  = ntot*log((real)ntot)/log(2.0);
+                                inc_nrnb(nrnb, eNR_FFT, 2*npme);
+                            }
+                            wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+                        }
  
-                /* distribute local grid to all nodes */
+                        copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
+
+                    } /*#pragma omp parallel*/
+
+                    /* distribute local grid to all nodes */
  #ifdef GMX_MPI
-                if (pme->nnodes > 1)
-                {
-                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
-                }
+                    if (pme->nnodes > 1)
+                    {
+                        gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
+                    }
  #endif
-                where();
+                    where();
  
-                unwrap_periodic_pmegrid(pme, grid);
+                    unwrap_periodic_pmegrid(pme, grid);
  
-                /* interpolate forces for our local atoms */
-                where();
-                bClearF = (bFirst && PAR(cr));
-                scale   = pme->bFEP ? (q < 9 ? 1.0-lambda_lj : lambda_lj) : 1.0;
-                scale  *= lb_scale_factor[q-2];
+                    /* interpolate forces for our local atoms */
+                    where();
+                    bClearF = (bFirst && PAR(cr));
+                    scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
+                    scale  *= lb_scale_factor[grid_index-2];
  #pragma omp parallel for num_threads(pme->nthread) schedule(static)
-                for (thread = 0; thread < pme->nthread; thread++)
-                {
-                    gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
-                                      &pme->atc[0].spline[thread],
-                                      scale);
-                }
-                where();
+                    for (thread = 0; thread < pme->nthread; thread++)
+                    {
+                        gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
+                                          &pme->atc[0].spline[thread],
+                                          scale);
+                    }
+                    where();
  
-                inc_nrnb(nrnb, eNR_GATHERFBSP,
-                         pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
-                wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+                    inc_nrnb(nrnb, eNR_GATHERFBSP,
+                             pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
  
-                bFirst = FALSE;
-            } /*for (q = 8; q >= 2; --q)*/
-        }     /*if (bCalcF)*/
-    }         /*if (flags & GMX_PME_LJ_LB)*/
+                    bFirst = FALSE;
+                } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
+            }     /* if (bCalcF) */
+        }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
+    }             /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
  
      if (bCalcF && pme->nnodes > 1)
      {
@@ -5365,10 +5316,6 @@ int gmx_pme_do(gmx_pme_t pme,
                  dd_pmeredist_f(pme, atc, n_d, f_d,
                                 d == pme->ndecompdim-1 && pme->bPPnode);
              }
-            else
-            {
-                pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
-            }
          }
  
          wallcycle_stop(wcycle, ewcPME_REDISTXF);