Sort all includes in src/gromacs
diff --git a/src/gromacs/mdlib/pme.c b/src/gromacs/mdlib/pme.c
index 503e1b449e45ca34add3696e0037b58e9d9aa91d..67a12ab58db8d0a8f5b4dc4a2f53f75214da1568 100644
--- a/src/gromacs/mdlib/pme.c
+++ b/src/gromacs/mdlib/pme.c
@@ -1,37 +1,38 @@
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
  *
- *
- *                This source code is part of
- *
- *                 G   R   O   M   A   C   S
- *
- *          GROningen MAchine for Chemical Simulations
- *
- *                        VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
  * of the License, or (at your option) any later version.
  *
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  *
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
  *
- * And Hey:
- * GROwing Monsters And Cloning Shrimps
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
  */
 /* IMPORTANT FOR DEVELOPERS:
  *
  * /Erik 001109
  */
 
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
 
-#include "gromacs/fft/parallel_3dfft.h"
-#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/legacyheaders/pme.h"
 
+#include "config.h"
+
+#include <assert.h>
+#include <math.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-#include "txtdump.h"
-#include "vec.h"
-#include "gmxcomplex.h"
-#include "smalloc.h"
-#include "futil.h"
-#include "coulomb.h"
-#include "gmx_fatal.h"
-#include "pme.h"
-#include "network.h"
-#include "physics.h"
-#include "nrnb.h"
-#include "gmx_wallcycle.h"
-#include "pdbio.h"
-#include "gmx_cyclecounter.h"
-#include "gmx_omp.h"
-#include "macros.h"
 
+#include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/fileio/pdbio.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/math/gmxcomplex.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/timing/cyclecounter.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/futil.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/smalloc.h"
 
 /* Include the SIMD macro file and then check for support */
-#include "gmx_simd_macros.h"
-#if defined GMX_HAVE_SIMD_MACROS && defined GMX_SIMD_HAVE_EXP
-/* Turn on SIMD intrinsics for PME solve */
-#define PME_SIMD
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#ifdef GMX_SIMD_HAVE_REAL
+/* Turn on arbitrary width SIMD intrinsics for PME solve */
+#    define PME_SIMD_SOLVE
 #endif
 
-/* SIMD spread+gather only in single precision with SSE2 or higher available.
- * We might want to switch to use gmx_simd_macros.h, but this is somewhat
- * complicated, as we use unaligned and/or 4-wide only loads.
+#define PME_GRID_QA    0 /* Grid index for A-state for Q */
+#define PME_GRID_C6A   2 /* Grid index for A-state for LJ */
+#define DO_Q           2 /* Electrostatic grids have index q<2 */
+#define DO_Q_AND_LJ    4 /* non-LB LJ grids have index 2 <= q < 4 */
+#define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
+
+/* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
+const real lb_scale_factor[] = {
+    1.0/64, 6.0/64, 15.0/64, 20.0/64,
+    15.0/64, 6.0/64, 1.0/64
+};
+
+/* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
+const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
+
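The two tables are scaled binomial coefficients: lb_scale_factor[k] = C(6,k)/2^6 comes from expanding (sigma_i + sigma_j)^6 in the Lorentz-Berthelot combination rule, and lb_scale_factor_symm folds the symmetric k and 6-k cross terms together (2*C(6,k)/64 for k < 3, plus the middle term 20/64 once). A minimal standalone check (illustrative sketch, not part of the patch) that both tables sum to 1:

    #include <stdio.h>

    int main(void)
    {
        /* C(6,k) for k = 0..6, and the folded variant used in
         * solve_pme_lj_yzx; both scaled by (1/2)^6 = 1/64.
         */
        double full[7] = { 1, 6, 15, 20, 15, 6, 1 };
        double symm[4] = { 2, 12, 30, 20 };
        double s_full  = 0, s_symm = 0;
        int    k;

        for (k = 0; k < 7; k++)
        {
            s_full += full[k]/64.0;
        }
        for (k = 0; k < 4; k++)
        {
            s_symm += symm[k]/64.0;
        }
        printf("%g %g\n", s_full, s_symm); /* prints: 1 1 */
        return 0;
    }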
+/* Check if we have 4-wide SIMD macro support */
+#if (defined GMX_SIMD4_HAVE_REAL)
+/* Do PME spread and gather with 4-wide SIMD.
+ * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
  */
-#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
-#define PME_SSE_SPREAD_GATHER
-#include <emmintrin.h>
-/* Some old AMD processors could have problems with unaligned loads+stores */
-#ifndef GMX_FAHCORE
-#define PME_SSE_UNALIGNED
-#endif
+#    define PME_SIMD4_SPREAD_GATHER
+
+#    if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
+/* With PME-order=4 on x86, unaligned load+store is slightly faster
+ * than doubling all SIMD operations when using aligned load+store.
+ */
+#        define PME_SIMD4_UNALIGNED
+#    endif
 #endif
 
 #define DFT_TOL 1e-7
 #ifdef GMX_DOUBLE
 #define mpi_type MPI_DOUBLE
 #else
 #define mpi_type MPI_FLOAT
 #endif
 
-/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
+#ifdef PME_SIMD4_SPREAD_GATHER
+#    define SIMD4_ALIGNMENT  (GMX_SIMD4_WIDTH*sizeof(real))
+#else
+/* We can use any alignment, apart from 0, so we use 4 reals */
+#    define SIMD4_ALIGNMENT  (4*sizeof(real))
+#endif
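SIMD4_ALIGNMENT feeds the snew_aligned() calls for heap data below; stack buffers instead get over-sized and rounded up via gmx_simd4_align_r() (see the thz_buffer pattern in the spread and gather kernels). A sketch of that rounding under the usual power-of-two assumption; the real gmx_simd4_align_r() may be implemented differently:

    #include <stdint.h>
    #include <stdio.h>

    #define ALIGN_BYTES 16 /* stand-in for SIMD4_ALIGNMENT in single precision */

    int main(void)
    {
        /* Over-allocate, like thz_buffer[GMX_SIMD4_WIDTH*3], then round
         * the pointer up to the next ALIGN_BYTES boundary.
         */
        float  buffer[4*3];
        float *aligned = (float *)(((uintptr_t)buffer + (ALIGN_BYTES-1))
                                   & ~(uintptr_t)(ALIGN_BYTES-1));

        printf("%p -> %p\n", (void *)buffer, (void *)aligned);
        return 0;
    }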
+
+/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
+ * to preserve alignment.
+ */
 #define GMX_CACHE_SEP 64
 
 /* We only define a maximum to be able to use local arrays without allocation.
@@ -193,16 +223,16 @@ typedef struct {
     int      n;
     int      nalloc;
     rvec    *x;
-    real    *q;
+    real    *coefficient;
     rvec    *f;
     gmx_bool bSpread;       /* These coordinates are used for spreading */
     int      pme_order;
     ivec    *idx;
-    rvec    *fractx;            /* Fractional coordinate relative to the
-                                 * lower cell boundary
+    rvec    *fractx;            /* Fractional coordinate relative to
+                                 * the lower cell boundary
                                  */
     int             nthread;
-    int            *thread_idx; /* Which thread should spread which charge */
+    int            *thread_idx; /* Which thread should spread which coefficient */
     thread_plist_t *thread_plist;
     splinedata_t   *spline;
 } pme_atomcomm_t;
@@ -229,13 +259,12 @@ typedef struct {
     ivec       nthread_comm; /* The number of threads to communicate with        */
 } pmegrids_t;
 
-
 typedef struct {
-#ifdef PME_SSE_SPREAD_GATHER
-    /* Masks for SSE aligned spreading and gathering */
-    __m128 mask_SSE0[6], mask_SSE1[6];
+#ifdef PME_SIMD4_SPREAD_GATHER
+    /* Masks for 4-wide SIMD aligned spreading and gathering */
+    gmx_simd4_bool_t mask_S0[6], mask_S1[6];
 #else
-    int    dummy; /* C89 requires that struct has at least one member */
+    int              dummy; /* C89 requires that struct has at least one member */
 #endif
 } pme_spline_work_t;
 
@@ -249,11 +278,14 @@ typedef struct {
     real *   denom;
     real *   tmp1_alloc;
     real *   tmp1;
+    real *   tmp2;
     real *   eterm;
     real *   m2inv;
 
-    real     energy;
-    matrix   vir;
+    real     energy_q;
+    matrix   vir_q;
+    real     energy_lj;
+    matrix   vir_lj;
 } pme_work_t;
 
 typedef struct gmx_pme {
@@ -276,14 +308,27 @@ typedef struct gmx_pme {
 
     gmx_bool   bPPnode;       /* Node also does particle-particle forces */
     gmx_bool   bFEP;          /* Compute Free energy contribution */
+    gmx_bool   bFEP_q;
+    gmx_bool   bFEP_lj;
     int        nkx, nky, nkz; /* Grid dimensions */
     gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
     int        pme_order;
     real       epsilon_r;
 
-    pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
-    pmegrids_t pmegridB;
-    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
+    int        ljpme_combination_rule;  /* Type of combination rule in LJ-PME */
+
+    int        ngrids;                  /* Number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups */
+
+    pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
+                                         * includes overlap. Grid indices are ordered as
+                                         * follows:
+                                         * 0: Coulomb PME, state A
+                                         * 1: Coulomb PME, state B
+                                         * 2-8: LJ-PME
+                                         * This can probably be done in a better way
+                                         * but this simple hack works for now
+                                         */
+    /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
     int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
     /* pmegrid_nz might be larger than strictly necessary to ensure
      * memory alignment, pmegrid_nz_base gives the real base size.
@@ -293,56 +338,50 @@ typedef struct gmx_pme {
     int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
 
     /* Work data for spreading and gathering */
-    pme_spline_work_t    *spline_work;
+    pme_spline_work_t     *spline_work;
+
+    real                 **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer
+                                     * inside the interpolation grid, but separate for 2D PME decomp. */
+    int                    fftgrid_nx, fftgrid_ny, fftgrid_nz;
 
-    real                 *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
-    real                 *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
-    int                   fftgrid_nx, fftgrid_ny, fftgrid_nz;
+    t_complex            **cfftgrid;  /* Grids for complex FFT data */
 
-    t_complex            *cfftgridA;  /* Grids for complex FFT data */
-    t_complex            *cfftgridB;
-    int                   cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
+    int                    cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
 
-    gmx_parallel_3dfft_t  pfft_setupA;
-    gmx_parallel_3dfft_t  pfft_setupB;
+    gmx_parallel_3dfft_t  *pfft_setup;
 
-    int                  *nnx, *nny, *nnz;
-    real                 *fshx, *fshy, *fshz;
+    int                   *nnx, *nny, *nnz;
+    real                  *fshx, *fshy, *fshz;
 
-    pme_atomcomm_t        atc[2]; /* Indexed on decomposition index */
-    matrix                recipbox;
-    splinevec             bsp_mod;
+    pme_atomcomm_t         atc[2]; /* Indexed on decomposition index */
+    matrix                 recipbox;
+    splinevec              bsp_mod;
+    /* Buffers to store data for local atoms for L-B combination rule
+     * calculations in LJ-PME. lb_buf1 stores either the coefficients
+     * for spreading/gathering (in serial), or the C6 coefficient for
+     * local atoms (in parallel).  lb_buf2 is only used in parallel,
+     * and stores the sigma values for local atoms. */
+    real                 *lb_buf1, *lb_buf2;
+    int                   lb_buf_nalloc; /* Allocation size for the above buffers. */
 
-    pme_overlap_t         overlap[2]; /* Indexed on dimension, 0=x, 1=y */
+    pme_overlap_t         overlap[2];    /* Indexed on dimension, 0=x, 1=y */
 
-    pme_atomcomm_t        atc_energy; /* Only for gmx_pme_calc_energy */
+    pme_atomcomm_t        atc_energy;    /* Only for gmx_pme_calc_energy */
 
-    rvec                 *bufv;       /* Communication buffer */
-    real                 *bufr;       /* Communication buffer */
-    int                   buf_nalloc; /* The communication buffer size */
+    rvec                 *bufv;          /* Communication buffer */
+    real                 *bufr;          /* Communication buffer */
+    int                   buf_nalloc;    /* The communication buffer size */
 
     /* thread local work data for solve_pme */
     pme_work_t *work;
 
-    /* Work data for PME_redist */
-    gmx_bool redist_init;
-    int *    scounts;
-    int *    rcounts;
-    int *    sdispls;
-    int *    rdispls;
-    int *    sidx;
-    int *    idxa;
-    real *   redist_buf;
-    int      redist_buf_nalloc;
-
     /* Work data for sum_qgrid */
     real *   sum_qgrid_tmp;
     real *   sum_qgrid_dd_tmp;
 } t_gmx_pme;
 
-
 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
-                                   int start, int end, int thread)
+                                   int start, int grid_index, int end, int thread)
 {
     int             i;
     int            *idxptr, tix, tiy, tiz;
@@ -372,9 +411,9 @@ static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
     rzy = pme->recipbox[ZZ][YY];
     rzz = pme->recipbox[ZZ][ZZ];
 
-    g2tx = pme->pmegridA.g2t[XX];
-    g2ty = pme->pmegridA.g2t[YY];
-    g2tz = pme->pmegridA.g2t[ZZ];
+    g2tx = pme->pmegrid[grid_index].g2t[XX];
+    g2ty = pme->pmegrid[grid_index].g2t[YY];
+    g2tz = pme->pmegrid[grid_index].g2t[ZZ];
 
     bThreads = (atc->nthread > 1);
     if (bThreads)
@@ -581,8 +620,9 @@ static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
 
     srenew(th[XX], nalloc);
     srenew(th[YY], nalloc);
-    /* In z we add padding, this is only required for the aligned SSE code */
-    srenew(*ptr_z, nalloc+2*padding);
+    /* In z we add padding, this is only required for the aligned SIMD code */
+    sfree_aligned(*ptr_z);
+    snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
     th[ZZ] = *ptr_z + padding;
 
     for (i = 0; i < padding; i++)
@@ -624,7 +664,7 @@ static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
         if (atc->nslab > 1)
         {
             srenew(atc->x, atc->nalloc);
-            srenew(atc->q, atc->nalloc);
+            srenew(atc->coefficient, atc->nalloc);
             srenew(atc->f, atc->nalloc);
             for (i = nalloc_old; i < atc->nalloc; i++)
             {
@@ -649,121 +689,10 @@ static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
     }
 }
 
-static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
-                         int n, gmx_bool bXF, rvec *x_f, real *charge,
-                         pme_atomcomm_t *atc)
-/* Redistribute particle data for PME calculation */
-/* domain decomposition by x coordinate           */
-{
-    int *idxa;
-    int  i, ii;
-
-    if (FALSE == pme->redist_init)
-    {
-        snew(pme->scounts, atc->nslab);
-        snew(pme->rcounts, atc->nslab);
-        snew(pme->sdispls, atc->nslab);
-        snew(pme->rdispls, atc->nslab);
-        snew(pme->sidx, atc->nslab);
-        pme->redist_init = TRUE;
-    }
-    if (n > pme->redist_buf_nalloc)
-    {
-        pme->redist_buf_nalloc = over_alloc_dd(n);
-        srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
-    }
-
-    pme->idxa = atc->pd;
-
-#ifdef GMX_MPI
-    if (forw && bXF)
-    {
-        /* forward, redistribution from pp to pme */
-
-        /* Calculate send counts and exchange them with other nodes */
-        for (i = 0; (i < atc->nslab); i++)
-        {
-            pme->scounts[i] = 0;
-        }
-        for (i = 0; (i < n); i++)
-        {
-            pme->scounts[pme->idxa[i]]++;
-        }
-        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
-
-        /* Calculate send and receive displacements and index into send
-           buffer */
-        pme->sdispls[0] = 0;
-        pme->rdispls[0] = 0;
-        pme->sidx[0]    = 0;
-        for (i = 1; i < atc->nslab; i++)
-        {
-            pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
-            pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
-            pme->sidx[i]    = pme->sdispls[i];
-        }
-        /* Total # of particles to be received */
-        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
-
-        pme_realloc_atomcomm_things(atc);
-
-        /* Copy particle coordinates into send buffer and exchange*/
-        for (i = 0; (i < n); i++)
-        {
-            ii = DIM*pme->sidx[pme->idxa[i]];
-            pme->sidx[pme->idxa[i]]++;
-            pme->redist_buf[ii+XX] = x_f[i][XX];
-            pme->redist_buf[ii+YY] = x_f[i][YY];
-            pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
-        }
-        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
-                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
-                      pme->rvec_mpi, atc->mpi_comm);
-    }
-    if (forw)
-    {
-        /* Copy charge into send buffer and exchange*/
-        for (i = 0; i < atc->nslab; i++)
-        {
-            pme->sidx[i] = pme->sdispls[i];
-        }
-        for (i = 0; (i < n); i++)
-        {
-            ii = pme->sidx[pme->idxa[i]];
-            pme->sidx[pme->idxa[i]]++;
-            pme->redist_buf[ii] = charge[i];
-        }
-        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
-                      atc->q, pme->rcounts, pme->rdispls, mpi_type,
-                      atc->mpi_comm);
-    }
-    else   /* backward, redistribution from pme to pp */
-    {
-        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
-                      pme->redist_buf, pme->scounts, pme->sdispls,
-                      pme->rvec_mpi, atc->mpi_comm);
-
-        /* Copy data from receive buffer */
-        for (i = 0; i < atc->nslab; i++)
-        {
-            pme->sidx[i] = pme->sdispls[i];
-        }
-        for (i = 0; (i < n); i++)
-        {
-            ii          = DIM*pme->sidx[pme->idxa[i]];
-            x_f[i][XX] += pme->redist_buf[ii+XX];
-            x_f[i][YY] += pme->redist_buf[ii+YY];
-            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
-            pme->sidx[pme->idxa[i]]++;
-        }
-    }
-#endif
-}
-
-static void pme_dd_sendrecv(pme_atomcomm_t *atc,
-                            gmx_bool bBackward, int shift,
-                            void *buf_s, int nbyte_s,
-                            void *buf_r, int nbyte_r)
+static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
+                            gmx_bool gmx_unused bBackward, int gmx_unused shift,
+                            void gmx_unused *buf_s, int gmx_unused nbyte_s,
+                            void gmx_unused *buf_r, int gmx_unused nbyte_r)
 {
 #ifdef GMX_MPI
     int        dest, src;
@@ -803,9 +732,9 @@ static void pme_dd_sendrecv(pme_atomcomm_t *atc,
 #endif
 }
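The gmx_unused annotations are needed because every parameter of this function is referenced only inside the #ifdef GMX_MPI block: in builds without MPI the body is empty and the compiler would otherwise warn about unused parameters.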
 
-static void dd_pmeredist_x_q(gmx_pme_t pme,
-                             int n, gmx_bool bX, rvec *x, real *charge,
-                             pme_atomcomm_t *atc)
+static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
+                                    int n, gmx_bool bX, rvec *x, real *data,
+                                    pme_atomcomm_t *atc)
 {
     int *commnode, *buf_index;
     int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
@@ -825,7 +754,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
     {
         if (atc->count[atc->nodeid] + nsend != n)
         {
-            gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+            gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
                       "This usually means that your system is not well equilibrated.",
                       n - (atc->count[atc->nodeid] + nsend),
                       pme->nodeid, 'x'+atc->dimind);
@@ -845,7 +774,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
             /* Communicate the count */
             if (debug)
             {
-                fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+                fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
                         atc->dimind, atc->nodeid, commnode[i], scount);
             }
             pme_dd_sendrecv(atc, FALSE, i,
@@ -868,7 +797,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
             {
                 copy_rvec(x[i], atc->x[local_pos]);
             }
-            atc->q[local_pos] = charge[i];
+            atc->coefficient[local_pos] = data[i];
             local_pos++;
         }
         else
@@ -878,7 +807,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
             {
                 copy_rvec(x[i], pme->bufv[buf_index[node]]);
             }
-            pme->bufr[buf_index[node]] = charge[i];
+            pme->bufr[buf_index[node]] = data[i];
             buf_index[node]++;
         }
     }
@@ -897,10 +826,10 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
                                 pme->bufv[buf_pos], scount*sizeof(rvec),
                                 atc->x[local_pos], rcount*sizeof(rvec));
             }
-            /* Communicate the charges */
+            /* Communicate the coefficients */
             pme_dd_sendrecv(atc, FALSE, i,
                             pme->bufr+buf_pos, scount*sizeof(real),
-                            atc->q+local_pos, rcount*sizeof(real));
+                            atc->coefficient+local_pos, rcount*sizeof(real));
             buf_pos   += scount;
             local_pos += atc->rcount[i];
         }
@@ -979,8 +908,7 @@ static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 }
 
 #ifdef GMX_MPI
-static void
-gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
+static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 {
     pme_overlap_t *overlap;
     int            send_index0, send_nindex;
@@ -999,7 +927,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         /* Since we have already (un)wrapped the overlap in the z-dimension,
          * we only have to communicate 0 to nkz (not pmegrid_nz).
          */
-        if (direction == GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_GRID_FORWARD)
         {
             send_id       = overlap->send_id[ipulse];
             recv_id       = overlap->recv_id[ipulse];
@@ -1021,7 +949,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         /* Copy data to contiguous send buffer */
         if (debug)
         {
-            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, send_id,
                     pme->pmegrid_start_iy,
                     send_index0-pme->pmegrid_start_iy,
@@ -1053,7 +981,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         /* Get data from contiguous recv buffer */
         if (debug)
         {
-            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, recv_id,
                     pme->pmegrid_start_iy,
                     recv_index0-pme->pmegrid_start_iy,
@@ -1069,7 +997,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
                 for (k = 0; k < pme->nkz; k++)
                 {
                     iz = k;
-                    if (direction == GMX_SUM_QGRID_FORWARD)
+                    if (direction == GMX_SUM_GRID_FORWARD)
                     {
                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                     }
@@ -1091,7 +1019,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 
     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
     {
-        if (direction == GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_GRID_FORWARD)
         {
             send_id       = overlap->send_id[ipulse];
             recv_id       = overlap->recv_id[ipulse];
@@ -1117,12 +1045,12 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 
         if (debug)
         {
-            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, send_id,
                     pme->pmegrid_start_ix,
                     send_index0-pme->pmegrid_start_ix,
                     send_index0-pme->pmegrid_start_ix+send_nindex);
-            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, recv_id,
                     pme->pmegrid_start_ix,
                     recv_index0-pme->pmegrid_start_ix,
@@ -1136,7 +1064,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
                      overlap->mpi_comm, &stat);
 
         /* ADD data from contiguous recv buffer */
-        if (direction == GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_GRID_FORWARD)
         {
             p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
             for (i = 0; i < recv_nindex*datasize; i++)
@@ -1149,8 +1077,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 #endif
 
 
-static int
-copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
+static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
 {
     ivec    local_fft_ndata, local_fft_offset, local_fft_size;
     ivec    local_pme_size;
@@ -1158,7 +1085,7 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
     int     pmeidx, fftidx;
 
     /* Dimensions should be identical for A/B grid, so we just use A here */
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -1173,13 +1100,12 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
     {
 #ifdef DEBUG_PME
         FILE *fp, *fp2;
-        char  fn[STRLEN], format[STRLEN];
+        char  fn[STRLEN];
         real  val;
         sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
-        fp = ffopen(fn, "w");
+        fp = gmx_ffopen(fn, "w");
         sprintf(fn, "pmegrid%d.txt", pme->nodeid);
-        fp2 = ffopen(fn, "w");
-        sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
+        fp2 = gmx_ffopen(fn, "w");
 #endif
 
         for (ix = 0; ix < local_fft_ndata[XX]; ix++)
@@ -1195,8 +1121,8 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
                     val = 100*pmegrid[pmeidx];
                     if (pmegrid[pmeidx] != 0)
                     {
-                        fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
-                                5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+                        gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
+                                                 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
                     }
                     if (pmegrid[pmeidx] != 0)
                     {
@@ -1212,8 +1138,8 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
             }
         }
 #ifdef DEBUG_PME
-        ffclose(fp);
-        ffclose(fp2);
+        gmx_ffclose(fp);
+        gmx_ffclose(fp2);
 #endif
     }
     return 0;
@@ -1231,9 +1157,8 @@ static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 }
 
 
-static int
-copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
-                        int nthread, int thread)
+static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
+                                   int nthread, int thread)
 {
     ivec          local_fft_ndata, local_fft_offset, local_fft_size;
     ivec          local_pme_size;
@@ -1249,7 +1174,7 @@ copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
     c1 = omp_cyc_start();
 #endif
     /* Dimensions should be identical for A/B grid, so we just use A here */
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -1291,8 +1216,7 @@ copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
 }
 
 
-static void
-wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 {
     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
 
@@ -1353,8 +1277,7 @@ wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 }
 
 
-static void
-unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 {
     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
 
@@ -1428,7 +1351,7 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     for (ithx = 0; (ithx < order); ithx++)                    \
     {                                                    \
         index_x = (i0+ithx)*pny*pnz;                     \
-        valx    = qn*thx[ithx];                          \
+        valx    = coefficient*thx[ithx];                          \
                                                      \
         for (ithy = 0; (ithy < order); ithy++)                \
         {                                                \
@@ -1444,23 +1367,30 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     }
 
 
-static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
-                                     pme_atomcomm_t *atc, splinedata_t *spline,
-                                     pme_spline_work_t *work)
+static void spread_coefficients_bsplines_thread(pmegrid_t                    *pmegrid,
+                                                pme_atomcomm_t               *atc,
+                                                splinedata_t                 *spline,
+                                                pme_spline_work_t gmx_unused *work)
 {
 
-    /* spread charges from home atoms to local grid */
+    /* spread coefficients from home atoms to local grid */
     real          *grid;
     pme_overlap_t *ol;
     int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
     int       *    idxptr;
     int            order, norder, index_x, index_xy, index_xyz;
-    real           valx, valxy, qn;
+    real           valx, valxy, coefficient;
     real          *thx, *thy, *thz;
     int            localsize, bndsize;
     int            pnx, pny, pnz, ndatatot;
     int            offx, offy, offz;
 
+#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
+    real           thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
+
+    thz_aligned = gmx_simd4_align_r(thz_buffer);
+#endif
+
     pnx = pmegrid->s[XX];
     pny = pmegrid->s[YY];
     pnz = pmegrid->s[ZZ];
@@ -1480,10 +1410,10 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 
     for (nn = 0; nn < spline->n; nn++)
     {
-        n  = spline->ind[nn];
-        qn = atc->q[n];
+        n           = spline->ind[nn];
+        coefficient = atc->coefficient[n];
 
-        if (qn != 0)
+        if (coefficient != 0)
         {
             idxptr = atc->idx[n];
             norder = nn*order;
@@ -1499,23 +1429,23 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
             switch (order)
             {
                 case 4:
-#ifdef PME_SSE_SPREAD_GATHER
-#ifdef PME_SSE_UNALIGNED
-#define PME_SPREAD_SSE_ORDER4
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifdef PME_SIMD4_UNALIGNED
+#define PME_SPREAD_SIMD4_ORDER4
 #else
-#define PME_SPREAD_SSE_ALIGNED
+#define PME_SPREAD_SIMD4_ALIGNED
 #define PME_ORDER 4
 #endif
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_BSPLINE(4);
 #endif
                     break;
                 case 5:
-#ifdef PME_SSE_SPREAD_GATHER
-#define PME_SPREAD_SSE_ALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#define PME_SPREAD_SIMD4_ALIGNED
 #define PME_ORDER 5
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_BSPLINE(5);
 #endif
@@ -1528,11 +1458,11 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
     }
 }
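The case-4/case-5 pattern above re-includes gromacs/mdlib/pme_simd4.h with a different set of PME_* macros defined each time, so one template file expands into several specialized kernels. A single-file sketch of the same parameterize-by-macro technique (MAKE_SUM and sum_orderN are illustrative names, not GROMACS code):

    #include <stdio.h>

    /* One "kernel body" parameterized by ORDER, expanded twice below,
     * the same trick pme.c plays by re-including pme_simd4.h with
     * PME_ORDER and the PME_*_SIMD4_* macros set differently each time.
     */
    #define MAKE_SUM(ORDER)                         \
        static int sum_order##ORDER(void)           \
        {                                           \
            int i, s = 0;                           \
            for (i = 0; i < ORDER; i++) { s += i; } \
            return s;                               \
        }

    MAKE_SUM(4)
    MAKE_SUM(5)

    int main(void)
    {
        printf("%d %d\n", sum_order4(), sum_order5()); /* prints: 6 10 */
        return 0;
    }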
 
-static void set_grid_alignment(int *pmegrid_nz, int pme_order)
+static void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
 {
-#ifdef PME_SSE_SPREAD_GATHER
+#ifdef PME_SIMD4_SPREAD_GATHER
     if (pme_order == 5
-#ifndef PME_SSE_UNALIGNED
+#ifndef PME_SIMD4_UNALIGNED
         || pme_order == 4
 #endif
         )
@@ -1545,8 +1475,8 @@ static void set_grid_alignment(int *pmegrid_nz, int pme_order)
 
 static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
 {
-#ifdef PME_SSE_SPREAD_GATHER
-#ifndef PME_SSE_UNALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifndef PME_SIMD4_UNALIGNED
     if (pme_order == 4)
     {
         /* Add extra elements to ensure aligned operations do not go
@@ -1597,7 +1527,7 @@ static void pmegrid_init(pmegrid_t *grid,
     {
         gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
         set_gridsize_alignment(&gridsize, pme_order);
-        snew_aligned(grid->grid, gridsize, 16);
+        snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
     }
     else
     {
@@ -1716,7 +1646,7 @@ static void pmegrids_init(pmegrids_t *grids,
         set_gridsize_alignment(&gridsize, pme_order);
         snew_aligned(grids->grid_all,
                      grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
-                     16);
+                     SIMD4_ALIGNMENT);
 
         for (x = 0; x < grids->nc[XX]; x++)
         {
@@ -1815,6 +1745,8 @@ static void pmegrids_destroy(pmegrids_t *grids)
 
 static void realloc_work(pme_work_t *work, int nkx)
 {
+    int simd_width;
+
     if (nkx > work->nalloc)
     {
         work->nalloc = nkx;
@@ -1825,18 +1757,20 @@ static void realloc_work(pme_work_t *work, int nkx)
         /* Allocate an aligned pointer for SIMD operations, including extra
          * elements at the end for padding.
          */
-#ifdef PME_SIMD
-#define ALIGN_HERE  GMX_SIMD_WIDTH_HERE
+#ifdef PME_SIMD_SOLVE
+        simd_width = GMX_SIMD_REAL_WIDTH;
 #else
-/* We can use any alignment, apart from 0, so we use 4 */
-#define ALIGN_HERE  4
+        /* We can use any alignment, apart from 0, so we use 4 */
+        simd_width = 4;
 #endif
         sfree_aligned(work->denom);
         sfree_aligned(work->tmp1);
+        sfree_aligned(work->tmp2);
         sfree_aligned(work->eterm);
-        snew_aligned(work->denom, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
-        snew_aligned(work->tmp1,  work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
-        snew_aligned(work->eterm, work->nalloc+ALIGN_HERE, ALIGN_HERE*sizeof(real));
+        snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
+        snew_aligned(work->tmp1,  work->nalloc+simd_width, simd_width*sizeof(real));
+        snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
+        snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
         srenew(work->m2inv, work->nalloc);
     }
 }
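The extra simd_width elements matter because the SIMD solve loops below round their trip count up to whole registers, so the final pass may touch up to simd_width-1 elements past the logical length nkx. A minimal sketch of that overrun arithmetic (illustrative, not GROMACS code):

    #include <stdio.h>

    #define SIMD_WIDTH 4 /* stand-in for GMX_SIMD_REAL_WIDTH */

    int main(void)
    {
        int nkx    = 10;               /* logical array length       */
        int padded = nkx + SIMD_WIDTH; /* allocated length, as above */
        int kx;

        for (kx = 0; kx < nkx; kx += SIMD_WIDTH)
        {
            /* Last pass: kx = 8, touching indices 8..11 < padded = 14 */
            printf("load [%d..%d] of %d allocated\n",
                   kx, kx + SIMD_WIDTH - 1, padded);
        }
        return 0;
    }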
@@ -1850,36 +1784,40 @@ static void free_work(pme_work_t *work)
     sfree(work->m2);
     sfree_aligned(work->denom);
     sfree_aligned(work->tmp1);
+    sfree_aligned(work->tmp2);
     sfree_aligned(work->eterm);
     sfree(work->m2inv);
 }
 
 
-#ifdef PME_SIMD
+#if defined PME_SIMD_SOLVE
 /* Calculate exponentials through SIMD */
-inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 {
     {
-        const gmx_mm_pr two = gmx_set1_pr(2.0);
-        gmx_mm_pr f_simd;
-        gmx_mm_pr lu;
-        gmx_mm_pr tmp_d1, d_inv, tmp_r, tmp_e;
+        const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
+        gmx_simd_real_t f_simd;
+        gmx_simd_real_t lu;
+        gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
         int kx;
-        f_simd = gmx_load1_pr(&f);
-        for (kx = 0; kx < end; kx += GMX_SIMD_WIDTH_HERE)
+        f_simd = gmx_simd_set1_r(f);
+        /* We only need to calculate from start. But since start is 0 or 1
+         * and we want to use aligned loads/stores, we always start from 0.
+         */
+        for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
         {
-            tmp_d1   = gmx_load_pr(d_aligned+kx);
-            d_inv    = gmx_inv_pr(tmp_d1);
-            tmp_r    = gmx_load_pr(r_aligned+kx);
-            tmp_r    = gmx_exp_pr(tmp_r);
-            tmp_e    = gmx_mul_pr(f_simd, d_inv);
-            tmp_e    = gmx_mul_pr(tmp_e, tmp_r);
-            gmx_store_pr(e_aligned+kx, tmp_e);
+            tmp_d1   = gmx_simd_load_r(d_aligned+kx);
+            d_inv    = gmx_simd_inv_r(tmp_d1);
+            tmp_r    = gmx_simd_load_r(r_aligned+kx);
+            tmp_r    = gmx_simd_exp_r(tmp_r);
+            tmp_e    = gmx_simd_mul_r(f_simd, d_inv);
+            tmp_e    = gmx_simd_mul_r(tmp_e, tmp_r);
+            gmx_simd_store_r(e_aligned+kx, tmp_e);
         }
     }
 }
 #else
-inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
+gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
 {
     int kx;
     for (kx = start; kx < end; kx++)
@@ -1897,6 +1835,51 @@ inline static void calc_exponentials(int start, int end, real f, real *d, real *
 }
 #endif
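For reference, both branches compute the same quantity per wave vector: on entry tmp1[kx] holds -factor*|m|^2 with factor = pi^2/ewaldcoeff^2 (set up in solve_pme_yzx), and on exit

    eterm[kx] = f * exp(tmp1[kx]) / denom[kx],

the smooth-PME influence function exp(-pi^2 |m|^2 / beta^2) over a denominator carrying |m|^2 and the B-spline moduli.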
 
+#if defined PME_SIMD_SOLVE
+/* Calculate exponentials through SIMD */
+gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
+{
+    gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
+    const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
+    int kx;
+    for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
+    {
+        /* We only need to calculate from start. But since start is 0 or 1
+         * and we want to use aligned loads/stores, we always start from 0.
+         */
+        tmp_d = gmx_simd_load_r(d_aligned+kx);
+        d_inv = gmx_simd_inv_r(tmp_d);
+        gmx_simd_store_r(d_aligned+kx, d_inv);
+        tmp_r = gmx_simd_load_r(r_aligned+kx);
+        tmp_r = gmx_simd_exp_r(tmp_r);
+        gmx_simd_store_r(r_aligned+kx, tmp_r);
+        tmp_mk  = gmx_simd_load_r(factor_aligned+kx);
+        tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
+        gmx_simd_store_r(factor_aligned+kx, tmp_fac);
+    }
+}
+#else
+gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
+{
+    int kx;
+    real mk;
+    for (kx = start; kx < end; kx++)
+    {
+        d[kx] = 1.0/d[kx];
+    }
+
+    for (kx = start; kx < end; kx++)
+    {
+        r[kx] = exp(r[kx]);
+    }
+
+    for (kx = start; kx < end; kx++)
+    {
+        mk       = tmp2[kx];
+        tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
+    }
+}
+#endif
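On exit the three arrays hold denom -> 1/denom, tmp1 -> exp(-x^2) and tmp2 -> sqrt(pi)*x*erfc(x), where x = sqrt(factor)*|m| = pi*|m|/ewaldcoeff. solve_pme_lj_yzx below combines them as

    eterm(m) = -[ (1 - 2x^2) exp(-x^2) + 2 sqrt(pi) x^3 erfc(x) ] / denom(m),

the dispersion (r^-6) Ewald influence function.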
 
 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
                          real ewaldcoeff, real vol,
@@ -1930,7 +1913,7 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
     nz = pme->nkz;
 
     /* Dimensions should be identical for A/B grid, so we just use A here */
-    gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
                                       complex_order,
                                       local_ndata,
                                       local_offset,
@@ -1994,6 +1977,7 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
         p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 
         /* We should skip the k-space point (0,0,0) */
+        /* Note that since here x is the minor index, local_offset[XX]=0 */
         if (local_offset[XX] > 0 || ky > 0 || kz > 0)
         {
             kxstart = local_offset[XX];
@@ -2051,7 +2035,7 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
                 m2inv[kx] = 1.0/m2[kx];
             }
 
-            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+            calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
 
             for (kx = kxstart; kx < kxend; kx++, p0++)
             {
@@ -2113,7 +2097,7 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
                 tmp1[kx]  = -factor*m2k;
             }
 
-            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+            calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
 
             for (kx = kxstart; kx < kxend; kx++, p0++)
             {
@@ -2134,39 +2118,368 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
          * experiencing problems on semiisotropic membranes.
          * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
          */
-        work->vir[XX][XX] = 0.25*virxx;
-        work->vir[YY][YY] = 0.25*viryy;
-        work->vir[ZZ][ZZ] = 0.25*virzz;
-        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
-        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
-        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
+        work->vir_q[XX][XX] = 0.25*virxx;
+        work->vir_q[YY][YY] = 0.25*viryy;
+        work->vir_q[ZZ][ZZ] = 0.25*virzz;
+        work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
+        work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
+        work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;
 
         /* This energy should be corrected for a charged system */
-        work->energy = 0.5*energy;
+        work->energy_q = 0.5*energy;
     }
 
     /* Return the loop count */
     return local_ndata[YY]*local_ndata[XX];
 }
 
-static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
-                             real *mesh_energy, matrix vir)
+static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
+                            real ewaldcoeff, real vol,
+                            gmx_bool bEnerVir, int nthread, int thread)
+{
+    /* do recip sum over local cells in grid */
+    /* y major, z middle, x minor or continuous */
+    int     ig, gcount;
+    int     kx, ky, kz, maxkx, maxky, maxkz;
+    int     nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
+    real    mx, my, mz;
+    real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
+    real    ets2, ets2vf;
+    real    eterm, vterm, d1, d2, energy = 0;
+    real    by, bz;
+    real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
+    real    rxx, ryx, ryy, rzx, rzy, rzz;
+    real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
+    real    mhxk, mhyk, mhzk, m2k;
+    real    mk;
+    pme_work_t *work;
+    real    corner_fac;
+    ivec    complex_order;
+    ivec    local_ndata, local_offset, local_size;
+    nx = pme->nkx;
+    ny = pme->nky;
+    nz = pme->nkz;
+
+    /* Dimensions should be identical for A/B grid, so we just use A here */
+    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
+                                      complex_order,
+                                      local_ndata,
+                                      local_offset,
+                                      local_size);
+    rxx = pme->recipbox[XX][XX];
+    ryx = pme->recipbox[YY][XX];
+    ryy = pme->recipbox[YY][YY];
+    rzx = pme->recipbox[ZZ][XX];
+    rzy = pme->recipbox[ZZ][YY];
+    rzz = pme->recipbox[ZZ][ZZ];
+
+    maxkx = (nx+1)/2;
+    maxky = (ny+1)/2;
+    maxkz = nz/2+1;
+
+    work  = &pme->work[thread];
+    mhx   = work->mhx;
+    mhy   = work->mhy;
+    mhz   = work->mhz;
+    m2    = work->m2;
+    denom = work->denom;
+    tmp1  = work->tmp1;
+    tmp2  = work->tmp2;
+
+    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
+    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
+
+    for (iyz = iyz0; iyz < iyz1; iyz++)
+    {
+        iy = iyz/local_ndata[ZZ];
+        iz = iyz - iy*local_ndata[ZZ];
+
+        ky = iy + local_offset[YY];
+
+        if (ky < maxky)
+        {
+            my = ky;
+        }
+        else
+        {
+            my = (ky - ny);
+        }
+
+        by = 3.0*vol*pme->bsp_mod[YY][ky]
+            / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
+
+        kz = iz + local_offset[ZZ];
+
+        mz = kz;
+
+        bz = pme->bsp_mod[ZZ][kz];
+
+        /* 0.5 correction for corner points */
+        corner_fac = 1;
+        if (kz == 0 || kz == (nz+1)/2)
+        {
+            corner_fac = 0.5;
+        }
+
+        kxstart = local_offset[XX];
+        kxend   = local_offset[XX] + local_ndata[XX];
+        if (bEnerVir)
+        {
+            /* More expensive inner loop, especially because of the
+             * storage of the mh elements in arrays.  Because x is the
+             * minor grid index, all mh elements depend on kx for
+             * triclinic unit cells.
+             */
+
+            /* Two explicit loops to avoid a conditional inside the loop */
+            for (kx = kxstart; kx < maxkx; kx++)
+            {
+                mx = kx;
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                mhx[kx]   = mhxk;
+                mhy[kx]   = mhyk;
+                mhz[kx]   = mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+                tmp2[kx]  = sqrt(factor*m2k);
+            }
+
+            for (kx = maxkx; kx < kxend; kx++)
+            {
+                mx = (kx - nx);
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                mhx[kx]   = mhxk;
+                mhy[kx]   = mhyk;
+                mhz[kx]   = mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+                tmp2[kx]  = sqrt(factor*m2k);
+            }
+
+            calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
+
+            for (kx = kxstart; kx < kxend; kx++)
+            {
+                m2k   = factor*m2[kx];
+                eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
+                          + 2.0*m2k*tmp2[kx]);
+                vterm    = 3.0*(-tmp1[kx] + tmp2[kx]);
+                tmp1[kx] = eterm*denom[kx];
+                tmp2[kx] = vterm*denom[kx];
+            }
+
+            if (!bLB)
+            {
+                t_complex *p0;
+                real       struct2;
+
+                p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                for (kx = kxstart; kx < kxend; kx++, p0++)
+                {
+                    d1      = p0->re;
+                    d2      = p0->im;
+
+                    eterm   = tmp1[kx];
+                    vterm   = tmp2[kx];
+                    p0->re  = d1*eterm;
+                    p0->im  = d2*eterm;
+
+                    struct2 = 2.0*(d1*d1+d2*d2);
+
+                    tmp1[kx] = eterm*struct2;
+                    tmp2[kx] = vterm*struct2;
+                }
+            }
+            else
+            {
+                real *struct2 = denom;
+                real  str2;
+
+                for (kx = kxstart; kx < kxend; kx++)
+                {
+                    struct2[kx] = 0.0;
+                }
+                /* Due to symmetry we only need to calculate 4 of the 7 terms */
+                for (ig = 0; ig <= 3; ++ig)
+                {
+                    t_complex *p0, *p1;
+                    real       scale;
+
+                    p0    = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                    p1    = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                    scale = 2.0*lb_scale_factor_symm[ig];
+                    for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
+                    {
+                        struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
+                    }
+
+                }
+                for (ig = 0; ig <= 6; ++ig)
+                {
+                    t_complex *p0;
+
+                    p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                    for (kx = kxstart; kx < kxend; kx++, p0++)
+                    {
+                        d1     = p0->re;
+                        d2     = p0->im;
+
+                        eterm  = tmp1[kx];
+                        p0->re = d1*eterm;
+                        p0->im = d2*eterm;
+                    }
+                }
+                for (kx = kxstart; kx < kxend; kx++)
+                {
+                    eterm    = tmp1[kx];
+                    vterm    = tmp2[kx];
+                    str2     = struct2[kx];
+                    tmp1[kx] = eterm*str2;
+                    tmp2[kx] = vterm*str2;
+                }
+            }
+
+            for (kx = kxstart; kx < kxend; kx++)
+            {
+                ets2     = corner_fac*tmp1[kx];
+                vterm    = 2.0*factor*tmp2[kx];
+                energy  += ets2;
+                ets2vf   = corner_fac*vterm;
+                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
+                virxy   += ets2vf*mhx[kx]*mhy[kx];
+                virxz   += ets2vf*mhx[kx]*mhz[kx];
+                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
+                viryz   += ets2vf*mhy[kx]*mhz[kx];
+                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
+            }
+        }
+        else
+        {
+            /* We don't need to calculate the energy and the virial.
+             *  In this case the triclinic overhead is small.
+             */
+
+            /* Two explicit loops to avoid a conditional inside the loop */
+
+            for (kx = kxstart; kx < maxkx; kx++)
+            {
+                mx = kx;
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+                tmp2[kx]  = sqrt(factor*m2k);
+            }
+
+            for (kx = maxkx; kx < kxend; kx++)
+            {
+                mx = (kx - nx);
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+                tmp2[kx]  = sqrt(factor*m2k);
+            }
+
+            calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
+
+            for (kx = kxstart; kx < kxend; kx++)
+            {
+                m2k    = factor*m2[kx];
+                eterm  = -((1.0 - 2.0*m2k)*tmp1[kx]
+                           + 2.0*m2k*tmp2[kx]);
+                tmp1[kx] = eterm*denom[kx];
+            }
+            gcount = (bLB ? 7 : 1);
+            for (ig = 0; ig < gcount; ++ig)
+            {
+                t_complex *p0;
+
+                p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                for (kx = kxstart; kx < kxend; kx++, p0++)
+                {
+                    d1      = p0->re;
+                    d2      = p0->im;
+
+                    eterm   = tmp1[kx];
+
+                    p0->re  = d1*eterm;
+                    p0->im  = d2*eterm;
+                }
+            }
+        }
+    }
+    if (bEnerVir)
+    {
+        work->vir_lj[XX][XX] = 0.25*virxx;
+        work->vir_lj[YY][YY] = 0.25*viryy;
+        work->vir_lj[ZZ][ZZ] = 0.25*virzz;
+        work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
+        work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
+        work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
+
+        /* This energy should be corrected for a charged system */
+        work->energy_lj = 0.5*energy;
+    }
+    /* Return the loop count */
+    return local_ndata[YY]*local_ndata[XX];
+}
+
+static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
+                               real *mesh_energy, matrix vir)
+{
+    /* This function sums output over threads and should therefore
+     * only be called after thread synchronization.
+     */
+    int thread;
+
+    *mesh_energy = pme->work[0].energy_q;
+    copy_mat(pme->work[0].vir_q, vir);
+
+    for (thread = 1; thread < nthread; thread++)
+    {
+        *mesh_energy += pme->work[thread].energy_q;
+        m_add(vir, pme->work[thread].vir_q, vir);
+    }
+}
+
+static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
+                                real *mesh_energy, matrix vir)
 {
-    /* This function sums output over threads
-     * and should therefore only be called after thread synchronization.
+    /* This function sums output over threads and should therefore
+     * only be called after thread synchronization.
      */
     int thread;
 
-    *mesh_energy = pme->work[0].energy;
-    copy_mat(pme->work[0].vir, vir);
+    *mesh_energy = pme->work[0].energy_lj;
+    copy_mat(pme->work[0].vir_lj, vir);
 
     for (thread = 1; thread < nthread; thread++)
     {
-        *mesh_energy += pme->work[thread].energy;
-        m_add(vir, pme->work[thread].vir, vir);
+        *mesh_energy += pme->work[thread].energy_lj;
+        m_add(vir, pme->work[thread].vir_lj, vir);
     }
 }
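Both helpers rely on the same pattern: each thread writes its partial results into pme->work[thread] inside the parallel region, and the sums here must run only after all threads have passed a barrier. A minimal OpenMP sketch of that pattern, assuming at most NTHREAD_MAX threads (names illustrative, not the GROMACS API):

    #include <stdio.h>
    #include <omp.h>

    #define NTHREAD_MAX 64

    int main(void)
    {
        double partial[NTHREAD_MAX] = {0}; /* one slot per thread, like pme->work[] */
        double total = 0;
        int    nthread = 1, t;

    #pragma omp parallel
        {
            int id = omp_get_thread_num();
    #pragma omp single
            nthread = omp_get_num_threads();
            partial[id] = 1.0 + id;        /* stand-in for work[thread].energy_q */
        }                                  /* implicit barrier: all slots written */

        for (t = 0; t < nthread; t++)      /* safe to reduce only after the barrier */
        {
            total += partial[t];
        }
        printf("total over %d threads = %g\n", nthread, total);
        return 0;
    }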
 
+
 #define DO_FSPLINE(order)                      \
     for (ithx = 0; (ithx < order); ithx++)              \
     {                                              \
@@ -2204,7 +2517,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
     int     index_x, index_xy;
     int     nx, ny, nz, pnx, pny, pnz;
     int *   idxptr;
-    real    tx, ty, dx, dy, qn;
+    real    tx, ty, dx, dy, coefficient;
     real    fx, fy, fz, gval;
     real    fxy1, fz1;
     real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
@@ -2214,6 +2527,14 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 
     pme_spline_work_t *work;
 
+#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
+    real           thz_buffer[GMX_SIMD4_WIDTH*3],  *thz_aligned;
+    real           dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
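+    /* The buffers span 3 SIMD widths: up to two widths of spline data
+     * plus the slack gmx_simd4_align_r needs to round the pointer up
+     * to a SIMD4 boundary. */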
+
+    thz_aligned  = gmx_simd4_align_r(thz_buffer);
+    dthz_aligned = gmx_simd4_align_r(dthz_buffer);
+#endif
+
     work = pme->spline_work;
 
     order = pme->pme_order;
@@ -2239,8 +2560,8 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 
     for (nn = 0; nn < spline->n; nn++)
     {
-        n  = spline->ind[nn];
-        qn = scale*atc->q[n];
+        n           = spline->ind[nn];
+        coefficient = scale*atc->coefficient[n];
 
         if (bClearF)
         {
@@ -2248,7 +2569,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
             atc->f[n][YY] = 0;
             atc->f[n][ZZ] = 0;
         }
-        if (qn != 0)
+        if (coefficient != 0)
         {
             fx     = 0;
             fy     = 0;
@@ -2271,23 +2592,23 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
             switch (order)
             {
                 case 4:
-#ifdef PME_SSE_SPREAD_GATHER
-#ifdef PME_SSE_UNALIGNED
-#define PME_GATHER_F_SSE_ORDER4
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifdef PME_SIMD4_UNALIGNED
+#define PME_GATHER_F_SIMD4_ORDER4
 #else
-#define PME_GATHER_F_SSE_ALIGNED
+#define PME_GATHER_F_SIMD4_ALIGNED
 #define PME_ORDER 4
 #endif
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_FSPLINE(4);
 #endif
                     break;
                 case 5:
-#ifdef PME_SSE_SPREAD_GATHER
-#define PME_GATHER_F_SSE_ALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#define PME_GATHER_F_SIMD4_ALIGNED
 #define PME_ORDER 5
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_FSPLINE(5);
 #endif
@@ -2297,9 +2618,9 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
                     break;
             }
 
-            atc->f[n][XX] += -qn*( fx*nx*rxx );
-            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
-            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
+            atc->f[n][XX] += -coefficient*( fx*nx*rxx );
+            atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
+            atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
         }
     }
     /* Since the energy and not forces are interpolated
@@ -2321,7 +2642,7 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
     int     n, ithx, ithy, ithz, i0, j0, k0;
     int     index_x, index_xy;
     int *   idxptr;
-    real    energy, pot, tx, ty, qn, gval;
+    real    energy, pot, tx, ty, coefficient, gval;
     real    *thx, *thy, *thz;
     int     norder;
     int     order;
@@ -2333,9 +2654,9 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
     energy = 0;
     for (n = 0; (n < atc->n); n++)
     {
-        qn      = atc->q[n];
+        coefficient = atc->coefficient[n];
 
-        if (qn != 0)
+        if (coefficient != 0)
         {
             idxptr = atc->idx[n];
             norder = n*order;
@@ -2369,7 +2690,7 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
                 }
             }
 
-            energy += pot*qn;
+            energy += pot*coefficient;
         }
     }
 
@@ -2431,8 +2752,8 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
     }
 
 void make_bsplines(splinevec theta, splinevec dtheta, int order,
-                   rvec fractx[], int nr, int ind[], real charge[],
-                   gmx_bool bFreeEnergy)
+                   rvec fractx[], int nr, int ind[], real coefficient[],
+                   gmx_bool bDoSplines)
 {
     /* construct splines for local atoms */
     int  i, ii;
@@ -2440,12 +2761,12 @@ void make_bsplines(splinevec theta, splinevec dtheta, int order,
 
     for (i = 0; i < nr; i++)
     {
-        /* With free energy we do not use the charge check.
+        /* With free energy we do not use the coefficient check.
          * In most cases this will be more efficient than calling make_bsplines
-         * twice, since usually more than half the particles have charges.
+         * twice, since usually more than half the particles have non-zero coefficients.
          */
         ii = ind[i];
-        if (bFreeEnergy || charge[ii] != 0.0)
+        if (bDoSplines || coefficient[ii] != 0.0)
         {
             xptr = fractx[ii];
             switch (order)
@@ -2653,7 +2974,7 @@ static void setup_coordinate_communication(pme_atomcomm_t *atc)
 
 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
 {
-    int thread;
+    int thread, i;
 
     if (NULL != log)
     {
@@ -2664,19 +2985,17 @@ int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
     sfree((*pmedata)->nny);
     sfree((*pmedata)->nnz);
 
-    pmegrids_destroy(&(*pmedata)->pmegridA);
-
-    sfree((*pmedata)->fftgridA);
-    sfree((*pmedata)->cfftgridA);
-    gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
-
-    if ((*pmedata)->pmegridB.grid.grid != NULL)
+    for (i = 0; i < (*pmedata)->ngrids; ++i)
     {
-        pmegrids_destroy(&(*pmedata)->pmegridB);
-        sfree((*pmedata)->fftgridB);
-        sfree((*pmedata)->cfftgridB);
-        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
+        pmegrids_destroy(&(*pmedata)->pmegrid[i]);
+        sfree((*pmedata)->fftgrid[i]);
+        sfree((*pmedata)->cfftgrid[i]);
+        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
     }
+
+    sfree((*pmedata)->lb_buf1);
+    sfree((*pmedata)->lb_buf2);
+
     for (thread = 0; thread < (*pmedata)->nthread; thread++)
     {
         free_work(&(*pmedata)->work[thread]);
@@ -2739,7 +3058,6 @@ static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
 
     if (atc->nslab > 1)
     {
-        /* These three allocations are not required for particle decomp. */
         snew(atc->node_dest, atc->nslab);
         snew(atc->node_src, atc->nslab);
         setup_coordinate_communication(atc);
@@ -2815,8 +3133,9 @@ init_overlap_comm(pme_overlap_t *  ol,
     {
         /* s2g0 the local interpolation grid start.
          * s2g1 the local interpolation grid end.
-         * Because grid overlap communication only goes forward,
-         * the grid the slabs for fft's should be rounded down.
+         * Since in calc_pidx we divide particles, not grid lines,
+         * spatially uniformly along dimension x or y, we need to
+         * round s2g0 down and s2g1 up.
          */
         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
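+        /* Illustrative numbers: ndata = 90, nnodes = 4 and norder = 4
+         * give s2g0 = {0, 22, 45, 67} (rounded down) and
+         * s2g1 = {26, 48, 71, 93} (rounded up, plus the norder-1 = 3
+         * points of spreading overlap).
+         */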
@@ -2957,33 +3276,36 @@ make_gridindex5_to_localindex(int n, int local_start, int local_range,
     *fraction_shift  = fsh;
 }
 
-static pme_spline_work_t *make_pme_spline_work(int order)
+static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
 {
     pme_spline_work_t *work;
 
-#ifdef PME_SSE_SPREAD_GATHER
-    float  tmp[8];
-    __m128 zero_SSE;
-    int    of, i;
+#ifdef PME_SIMD4_SPREAD_GATHER
+    real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
+    gmx_simd4_real_t zero_S;
+    gmx_simd4_real_t real_mask_S0, real_mask_S1;
+    int              of, i;
 
-    snew_aligned(work, 1, 16);
+    snew_aligned(work, 1, SIMD4_ALIGNMENT);
 
-    zero_SSE = _mm_setzero_ps();
+    tmp_aligned = gmx_simd4_align_r(tmp);
+
+    zero_S = gmx_simd4_setzero_r();
 
     /* Generate bit masks to mask out the unused grid entries,
      * as we only operate on order of the 8 grid entries that are
-     * load into 2 SSE float registers.
+     * loaded into 2 SIMD registers.
      */
-    for (of = 0; of < 8-(order-1); of++)
+    for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
     {
-        for (i = 0; i < 8; i++)
+        for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
         {
-            tmp[i] = (i >= of && i < of+order ? 1 : 0);
+            tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
         }
-        work->mask_SSE0[of] = _mm_loadu_ps(tmp);
-        work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
-        work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
-        work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
+        real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
+        real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
+        work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
+        work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
     }
 #else
     work = NULL;
@@ -3035,7 +3357,7 @@ void gmx_pme_check_restrictions(int pme_order,
             *bValidSettings = FALSE;
             return;
         }
-        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
+        gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pme_order-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
                   nkx/(double)nnodes_major, pme_order);
     }
 
@@ -3053,13 +3375,14 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                  int                 nnodes_minor,
                  t_inputrec *        ir,
                  int                 homenr,
-                 gmx_bool            bFreeEnergy,
+                 gmx_bool            bFreeEnergy_q,
+                 gmx_bool            bFreeEnergy_lj,
                  gmx_bool            bReproducible,
                  int                 nthread)
 {
     gmx_pme_t pme = NULL;
 
-    int  use_threads, sum_use_threads;
+    int  use_threads, sum_use_threads, i;
     ivec ndata;
 
     if (debug)
@@ -3068,11 +3391,9 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
     }
     snew(pme, 1);
 
-    pme->redist_init         = FALSE;
     pme->sum_qgrid_tmp       = NULL;
     pme->sum_qgrid_dd_tmp    = NULL;
     pme->buf_nalloc          = 0;
-    pme->redist_buf_nalloc   = 0;
 
     pme->nnodes              = 1;
     pme->bPPnode             = TRUE;
@@ -3089,7 +3410,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
         if (pme->nnodes != nnodes_major*nnodes_minor)
         {
-            gmx_incons("PME node count mismatch");
+            gmx_incons("PME rank count mismatch");
         }
     }
     else
@@ -3138,7 +3459,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         {
             if (pme->nnodes % nnodes_major != 0)
             {
-                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
+                gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
             }
             pme->ndecompdim = 2;
 
@@ -3179,14 +3500,21 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
     }
 
-    pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
+    pme->bFEP_q      = ((ir->efep != efepNO) && bFreeEnergy_q);
+    pme->bFEP_lj     = ((ir->efep != efepNO) && bFreeEnergy_lj);
+    pme->bFEP        = (pme->bFEP_q || pme->bFEP_lj);
     pme->nkx         = ir->nkx;
     pme->nky         = ir->nky;
     pme->nkz         = ir->nkz;
     pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
     pme->pme_order   = ir->pme_order;
+
+    /* The electrostatics coefficients are always constant */
     pme->epsilon_r   = ir->epsilon_r;
 
+    /* The LJ coefficients are always constant */
+    pme->ljpme_combination_rule = ir->ljpme_combination_rule;
+
     /* If we violate restrictions, generate a fatal error here */
     gmx_pme_check_restrictions(pme->pme_order,
                                pme->nkx, pme->nky, pme->nkz,
@@ -3205,10 +3533,10 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         MPI_Type_commit(&(pme->rvec_mpi));
 #endif
 
-        /* Note that the charge spreading and force gathering, which usually
+        /* Note that the coefficient spreading and force gathering, which usually
          * takes about the same amount of time as FFT+solve_pme,
          * is always fully load balanced
-         * (unless the charge distribution is inhomogeneous).
+         * (unless the coefficient distribution is inhomogeneous).
          */
 
         imbal = pme_load_imbalance(pme);
@@ -3218,8 +3546,8 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                     "\n"
                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
                     "      For optimal PME load balancing\n"
-                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
-                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
+                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
                     "\n",
                     (int)((imbal-1)*100 + 0.5),
                     pme->nkx, pme->nky, pme->nnodes_major,
@@ -3292,48 +3620,49 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                                   pme->pmegrid_nz_base,
                                   &pme->nnz, &pme->fshz);
 
-    pmegrids_init(&pme->pmegridA,
-                  pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
-                  pme->pmegrid_nz_base,
-                  pme->pme_order,
-                  pme->bUseThreads,
-                  pme->nthread,
-                  pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
-                  pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
-
     pme->spline_work = make_pme_spline_work(pme->pme_order);
 
-    ndata[0] = pme->nkx;
-    ndata[1] = pme->nky;
-    ndata[2] = pme->nkz;
-
-    /* This routine will allocate the grid data to fit the FFTs */
-    gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
-                            &pme->fftgridA, &pme->cfftgridA,
-                            pme->mpi_comm_d,
-                            bReproducible, pme->nthread);
-
-    if (bFreeEnergy)
+    ndata[0]    = pme->nkx;
+    ndata[1]    = pme->nky;
+    ndata[2]    = pme->nkz;
+    /* It does not matter if ngrids is larger than strictly needed:
+     * below we only initialize and use the grids we actually need.
+     */
+    if (EVDW_PME(ir->vdwtype))
     {
-        pmegrids_init(&pme->pmegridB,
-                      pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
-                      pme->pmegrid_nz_base,
-                      pme->pme_order,
-                      pme->bUseThreads,
-                      pme->nthread,
-                      pme->nkx % pme->nnodes_major != 0,
-                      pme->nky % pme->nnodes_minor != 0);
-
-        gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
-                                &pme->fftgridB, &pme->cfftgridB,
-                                pme->mpi_comm_d,
-                                bReproducible, pme->nthread);
+        pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
     }
     else
     {
-        pme->pmegridB.grid.grid = NULL;
-        pme->fftgridB           = NULL;
-        pme->cfftgridB          = NULL;
+        pme->ngrids = DO_Q;
+    }
+    snew(pme->fftgrid, pme->ngrids);
+    snew(pme->cfftgrid, pme->ngrids);
+    snew(pme->pfft_setup, pme->ngrids);
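+    /* Grid usage: indices 0 and 1 hold the Coulomb A- and B-state
+     * grids, indices from DO_Q (= 2) up the LJ grids. With geometric
+     * combination rules two LJ grids (C6 A/B-state) suffice; with
+     * Lorentz-Berthelot rules seven grids are needed for the sigma
+     * expansion, hence the larger DO_Q_AND_LJ_LB count.
+     */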
+
+    for (i = 0; i < pme->ngrids; ++i)
+    {
+        if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
+                                                       bFreeEnergy_q)) ||
+            (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
+                                                    bFreeEnergy_lj ||
+                                                    ir->ljpme_combination_rule == eljpmeLB)))
+        {
+            pmegrids_init(&pme->pmegrid[i],
+                          pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
+                          pme->pmegrid_nz_base,
+                          pme->pme_order,
+                          pme->bUseThreads,
+                          pme->nthread,
+                          pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
+                          pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
+            /* This routine will allocate the grid data to fit the FFTs */
+            gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
+                                    &pme->fftgrid[i], &pme->cfftgrid[i],
+                                    pme->mpi_comm_d,
+                                    bReproducible, pme->nthread);
+
+        }
     }
 
     if (!pme->bP3M)
@@ -3360,6 +3689,10 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         pme_realloc_atomcomm_things(&pme->atc[0]);
     }
 
+    pme->lb_buf1       = NULL;
+    pme->lb_buf2       = NULL;
+    pme->lb_buf_nalloc = 0;
+
     {
         int thread;
 
@@ -3427,12 +3760,12 @@ int gmx_pme_reinit(gmx_pme_t *         pmedata,
     }
 
     ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
-                       &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
+                       &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
 
     if (ret == 0)
     {
         /* We can easily reuse the allocated pme grids in pme_src */
-        reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
+        reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
         /* We would like to reuse the fft grids, but that's harder */
     }
 
@@ -3440,8 +3773,8 @@ int gmx_pme_reinit(gmx_pme_t *         pmedata,
 }
 
 
-static void copy_local_grid(gmx_pme_t pme,
-                            pmegrids_t *pmegrids, int thread, real *fftgrid)
+static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
+                            int grid_index, int thread, real *fftgrid)
 {
     ivec local_fft_ndata, local_fft_offset, local_fft_size;
     int  fft_my, fft_mz;
@@ -3452,7 +3785,7 @@ static void copy_local_grid(gmx_pme_t pme,
     pmegrid_t *pmegrid;
     real *grid_th;
 
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -3496,14 +3829,15 @@ static void copy_local_grid(gmx_pme_t pme,
 static void
 reduce_threadgrid_overlap(gmx_pme_t pme,
                           const pmegrids_t *pmegrids, int thread,
-                          real *fftgrid, real *commbuf_x, real *commbuf_y)
+                          real *fftgrid, real *commbuf_x, real *commbuf_y,
+                          int grid_index)
 {
     ivec local_fft_ndata, local_fft_offset, local_fft_size;
     int  fft_nx, fft_ny, fft_nz;
     int  fft_my, fft_mz;
     int  buf_my = -1;
     int  nsx, nsy, nsz;
-    ivec ne;
+    ivec localcopy_end;
     int  offx, offy, offz, x, y, z, i0, i0t;
     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
@@ -3514,7 +3848,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
     const real *grid_th;
     real *commbuf = NULL;
 
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -3539,8 +3873,14 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
 
     for (d = 0; d < DIM; d++)
     {
-        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
-                    local_fft_ndata[d]);
+        /* Determine up to where our thread needs to copy from the
+         * thread-local charge spreading grid to the rank-local FFT grid.
+         * This is up to our spreading grid end minus order-1 and
+         * not beyond the local FFT grid.
+         */
+        localcopy_end[d] =
+            min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+                local_fft_ndata[d]);
     }
 
     offx = pmegrid->offset[XX];
@@ -3555,7 +3895,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
     /* Now loop over all the thread data blocks that contribute
      * to the grid region we (our thread) are operating on.
      */
-    /* Note that ffy_nx/y is equal to the number of grid points
+    /* Note that fft_nx/y is equal to the number of grid points
      * between the first point of our node grid and the one of the next node.
      */
     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
@@ -3571,12 +3911,15 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
         }
         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
         ox       += pmegrid_g->offset[XX];
+        /* Determine the end of our part of the source grid */
         if (!bCommX)
         {
-            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+            /* Use our thread local source grid and target grid part */
+            tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
         }
         else
         {
+            /* Use our thread local source grid and the spreading range */
             tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
         }
 
@@ -3593,12 +3936,15 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
             }
             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
             oy       += pmegrid_g->offset[YY];
+            /* Determine the end of our part of the source grid */
             if (!bCommY)
             {
-                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+                /* Use our thread local source grid and target grid part */
+                ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
             }
             else
             {
+                /* Use our thread local source grid and the spreading range */
                 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
             }
 
@@ -3613,7 +3959,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                 }
                 pmegrid_g = &pmegrids->grid_th[fz];
                 oz       += pmegrid_g->offset[ZZ];
-                tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+                tz1       = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
 
                 if (sx == 0 && sy == 0 && sz == 0)
                 {
@@ -3669,7 +4015,13 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                     if (bCommY)
                     {
                         commbuf = commbuf_y;
-                        buf_my  = ty1 - offy;
+                        /* The y-size of the communication buffer is set by
+                         * the overlap of the grid part of our local slab
+                         * with the part starting at the next slab.
+                         */
+                        buf_my  =
+                            pme->overlap[1].s2g1[pme->nodeid_minor] -
+                            pme->overlap[1].s2g0[pme->nodeid_minor+1];
                         if (bCommX)
                         {
                             /* We index commbuf modulo the local grid size */
@@ -3724,7 +4076,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
 }
 
 
-static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
+static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
 {
     ivec local_fft_ndata, local_fft_offset, local_fft_size;
     pme_overlap_t *overlap;
@@ -3739,13 +4091,13 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
     int  x, y, z, indg, indb;
 
     /* Note that this routine is only used for forward communication.
-     * Since the force gathering, unlike the charge spreading,
+     * Since the force gathering, unlike the coefficient spreading,
      * can be trivially parallelized over the particles,
      * the backwards process is much simpler and can use the "old"
      * communication setup.
      */
 
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -3782,6 +4134,12 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
             sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
             recvptr = overlap->recvbuf;
 
+            if (debug != NULL)
+            {
+                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
+                        local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
+            }
+
 #ifdef GMX_MPI
             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
                          send_id, ipulse,
@@ -3848,7 +4206,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
 
         if (debug != NULL)
         {
-            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
         }
 
@@ -3879,7 +4237,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
 static void spread_on_grid(gmx_pme_t pme,
                            pme_atomcomm_t *atc, pmegrids_t *grids,
                            gmx_bool bCalcSplines, gmx_bool bSpread,
-                           real *fftgrid)
+                           real *fftgrid, gmx_bool bDoSplines, int grid_index)
 {
     int nthread, thread;
 #ifdef PME_TIME_THREADS
@@ -3908,7 +4266,7 @@ static void spread_on_grid(gmx_pme_t pme,
             /* Compute fftgrid index for all atoms,
              * with help of some extra variables.
              */
-            calc_interpolation_idx(pme, atc, start, end, thread);
+            calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
         }
     }
 #ifdef PME_TIME_THREADS
@@ -3943,7 +4301,7 @@ static void spread_on_grid(gmx_pme_t pme,
 
             if (grids->nthread == 1)
             {
-                /* One thread, we operate on all charges */
+                /* One thread, we operate on all coefficients */
                 spline->n = atc->n;
             }
             else
@@ -3958,7 +4316,7 @@ static void spread_on_grid(gmx_pme_t pme,
         if (bCalcSplines)
         {
             make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
-                          atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
+                          atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
         }
 
         if (bSpread)
@@ -3967,11 +4325,11 @@ static void spread_on_grid(gmx_pme_t pme,
 #ifdef PME_TIME_SPREAD
             ct1a = omp_cyc_start();
 #endif
-            spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
+            spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
 
             if (pme->bUseThreads)
             {
-                copy_local_grid(pme, grids, thread, fftgrid);
+                copy_local_grid(pme, grids, grid_index, thread, fftgrid);
             }
 #ifdef PME_TIME_SPREAD
             ct1a          = omp_cyc_end(ct1a);
@@ -3995,7 +4353,8 @@ static void spread_on_grid(gmx_pme_t pme,
             reduce_threadgrid_overlap(pme, grids, thread,
                                       fftgrid,
                                       pme->overlap[0].sendbuf,
-                                      pme->overlap[1].sendbuf);
+                                      pme->overlap[1].sendbuf,
+                                      grid_index);
         }
 #ifdef PME_TIME_THREADS
         c3   = omp_cyc_end(c3);
@@ -4008,7 +4367,7 @@ static void spread_on_grid(gmx_pme_t pme,
              * For this communication call we need to check pme->bUseThreads
              * to have all ranks communicate here, regardless of pme->nthread.
              */
-            sum_fftgrid_dd(pme, fftgrid);
+            sum_fftgrid_dd(pme, fftgrid, grid_index);
         }
     }
 
@@ -4053,7 +4412,7 @@ static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
 {
     ivec local_fft_ndata, local_fft_offset, local_fft_size;
 
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -4080,7 +4439,7 @@ void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
     {
         gmx_incons("gmx_pme_calc_energy called in parallel");
     }
-    if (pme->bFEP > 1)
+    if (pme->bFEP_q)
     {
         gmx_incons("gmx_pme_calc_energy with free energy");
     }
@@ -4096,23 +4455,23 @@ void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
     atc->pme_order = pme->pme_order;
     atc->n         = n;
     pme_realloc_atomcomm_things(atc);
-    atc->x         = x;
-    atc->q         = q;
+    atc->x           = x;
+    atc->coefficient = q;
 
     /* We only use the A-charges grid */
-    grid = &pme->pmegridA;
+    grid = &pme->pmegrid[PME_GRID_QA];
 
     /* Only calculate the spline coefficients, don't actually spread */
-    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
+    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
 
     *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
 }
 
 
 static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
-                                   gmx_runtime_t *runtime,
+                                   gmx_walltime_accounting_t walltime_accounting,
                                    t_nrnb *nrnb, t_inputrec *ir,
-                                   gmx_large_int_t step)
+                                   gmx_int64_t step)
 {
     /* Reset all the counters related to performance over the run */
     wallcycle_stop(wcycle, ewcRUN);
@@ -4125,7 +4484,7 @@ static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
     }
     ir->init_step = step;
     wallcycle_start(wcycle, ewcRUN);
-    runtime_start(runtime);
+    walltime_accounting_start(walltime_accounting);
 }
 
 
@@ -4162,12 +4521,11 @@ static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
     *pme_ret = (*pmedata)[ind];
 }
 
-
 int gmx_pmeonly(gmx_pme_t pme,
-                t_commrec *cr,    t_nrnb *nrnb,
+                t_commrec *cr,    t_nrnb *mynrnb,
                 gmx_wallcycle_t wcycle,
-                gmx_runtime_t *runtime,
-                real ewaldcoeff,
+                gmx_walltime_accounting_t walltime_accounting,
+                real ewaldcoeff_q, real ewaldcoeff_lj,
                 t_inputrec *ir)
 {
     int npmedata;
@@ -4178,14 +4536,18 @@ int gmx_pmeonly(gmx_pme_t pme,
     matrix box;
     rvec *x_pp      = NULL, *f_pp = NULL;
     real *chargeA   = NULL, *chargeB = NULL;
-    real lambda     = 0;
+    real *c6A       = NULL, *c6B = NULL;
+    real *sigmaA    = NULL, *sigmaB = NULL;
+    real lambda_q   = 0;
+    real lambda_lj  = 0;
     int  maxshift_x = 0, maxshift_y = 0;
-    real energy, dvdlambda;
-    matrix vir;
+    real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
+    matrix vir_q, vir_lj;
     float cycles;
     int  count;
     gmx_bool bEnerVir;
-    gmx_large_int_t step, step_rel;
+    int pme_flags;
+    gmx_int64_t step, step_rel;
     ivec grid_switch;
 
     /* This data will only be used with PME tuning, i.e. switching PME grids */
@@ -4195,7 +4557,7 @@ int gmx_pmeonly(gmx_pme_t pme,
 
     pme_pp = gmx_pme_pp_init(cr);
 
-    init_nrnb(nrnb);
+    init_nrnb(mynrnb);
 
     count = 0;
     do /****** this is a quasi-loop over time steps! */
@@ -4204,14 +4566,19 @@ int gmx_pmeonly(gmx_pme_t pme,
         do
         {
             /* Domain decomposition */
-            ret = gmx_pme_recv_q_x(pme_pp,
-                                   &natoms,
-                                   &chargeA, &chargeB, box, &x_pp, &f_pp,
-                                   &maxshift_x, &maxshift_y,
-                                   &pme->bFEP, &lambda,
-                                   &bEnerVir,
-                                   &step,
-                                   grid_switch, &ewaldcoeff);
+            ret = gmx_pme_recv_coeffs_coords(pme_pp,
+                                             &natoms,
+                                             &chargeA, &chargeB,
+                                             &c6A, &c6B,
+                                             &sigmaA, &sigmaB,
+                                             box, &x_pp, &f_pp,
+                                             &maxshift_x, &maxshift_y,
+                                             &pme->bFEP_q, &pme->bFEP_lj,
+                                             &lambda_q, &lambda_lj,
+                                             &bEnerVir,
+                                             &pme_flags,
+                                             &step,
+                                             grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
 
             if (ret == pmerecvqxSWITCHGRID)
             {
@@ -4222,7 +4589,7 @@ int gmx_pmeonly(gmx_pme_t pme,
             if (ret == pmerecvqxRESETCOUNTERS)
             {
                 /* Reset the cycle and flop counters */
-                reset_pmeonly_counters(wcycle, runtime, nrnb, ir, step);
+                reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
             }
         }
         while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
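+        /* At this point we have received a complete set of coordinates
+         * and coefficients for this step; grid switches and counter
+         * resets have already been handled inside the loop above. */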
@@ -4238,45 +4605,125 @@ int gmx_pmeonly(gmx_pme_t pme,
         if (count == 0)
         {
             wallcycle_start(wcycle, ewcRUN);
-            runtime_start(runtime);
+            walltime_accounting_start(walltime_accounting);
         }
 
         wallcycle_start(wcycle, ewcPMEMESH);
 
-        dvdlambda = 0;
-        clear_mat(vir);
-        gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
-                   cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
-                   &energy, lambda, &dvdlambda,
-                   GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
+        dvdlambda_q  = 0;
+        dvdlambda_lj = 0;
+        clear_mat(vir_q);
+        clear_mat(vir_lj);
+
+        gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
+                   chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
+                   cr, maxshift_x, maxshift_y, mynrnb, wcycle,
+                   vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
+                   &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
+                   pme_flags | GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
 
         cycles = wallcycle_stop(wcycle, ewcPMEMESH);
 
         gmx_pme_send_force_vir_ener(pme_pp,
-                                    f_pp, vir, energy, dvdlambda,
-                                    cycles);
+                                    f_pp, vir_q, energy_q, vir_lj, energy_lj,
+                                    dvdlambda_q, dvdlambda_lj, cycles);
 
         count++;
     } /***** end of quasi-loop, we stop with the break above */
     while (TRUE);
 
-    runtime_end(runtime);
+    walltime_accounting_end(walltime_accounting);
 
     return 0;
 }
 
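+/* The two helpers below generate, per particle, the coefficients for
+ * the seven Lorentz-Berthelot LJ-PME grids: calc_initial_lb_coeffs
+ * seeds c6_i/sigma_i^4 and every calc_next_lb_coeffs call multiplies
+ * by sigma_i, so grid g = 2..8 spreads c6_i*sigma_i^(g-5). Combined
+ * pairwise, these terms reproduce the binomial expansion of the
+ * (sigma_i+sigma_j)^6 factor in the LB C6 cross term.
+ */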
+static void
+calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
+{
+    int  i;
+
+    for (i = 0; i < pme->atc[0].n; ++i)
+    {
+        real sigma4;
+
+        sigma4                     = local_sigma[i];
+        sigma4                     = sigma4*sigma4;
+        sigma4                     = sigma4*sigma4;
+        pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
+    }
+}
+
+static void
+calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
+{
+    int  i;
+
+    for (i = 0; i < pme->atc[0].n; ++i)
+    {
+        pme->atc[0].coefficient[i] *= local_sigma[i];
+    }
+}
+
+static void
+do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
+                     gmx_bool bFirst, rvec x[], real *data)
+{
+    int      d;
+    pme_atomcomm_t *atc;
+    atc = &pme->atc[0];
+
+    for (d = pme->ndecompdim - 1; d >= 0; d--)
+    {
+        int             n_d;
+        rvec           *x_d;
+        real           *param_d;
+
+        if (d == pme->ndecompdim - 1)
+        {
+            n_d     = homenr;
+            x_d     = x + start;
+            param_d = data;
+        }
+        else
+        {
+            n_d     = pme->atc[d + 1].n;
+            x_d     = atc->x;
+            param_d = atc->coefficient;
+        }
+        atc      = &pme->atc[d];
+        atc->npd = n_d;
+        if (atc->npd > atc->pd_nalloc)
+        {
+            atc->pd_nalloc = over_alloc_dd(atc->npd);
+            srenew(atc->pd, atc->pd_nalloc);
+        }
+        pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
+        where();
+        /* Redistribute x (only once) and qA/c6A or qB/c6B */
+        if (DOMAINDECOMP(cr))
+        {
+            dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
+        }
+    }
+}
+
 int gmx_pme_do(gmx_pme_t pme,
                int start,       int homenr,
                rvec x[],        rvec f[],
                real *chargeA,   real *chargeB,
+               real *c6A,       real *c6B,
+               real *sigmaA,    real *sigmaB,
                matrix box, t_commrec *cr,
                int  maxshift_x, int maxshift_y,
                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
-               matrix vir,      real ewaldcoeff,
-               real *energy,    real lambda,
-               real *dvdlambda, int flags)
+               matrix vir_q,      real ewaldcoeff_q,
+               matrix vir_lj,   real ewaldcoeff_lj,
+               real *energy_q,  real *energy_lj,
+               real lambda_q, real lambda_lj,
+               real *dvdlambda_q, real *dvdlambda_lj,
+               int flags)
 {
-    int     q, d, i, j, ntot, npme;
+    int     d, i, j, k, ntot, npme, grid_index, max_grid_index;
     int     nx, ny, nz;
     int     n_d, local_ny;
     pme_atomcomm_t *atc = NULL;
@@ -4284,14 +4731,18 @@ int gmx_pme_do(gmx_pme_t pme,
     real    *grid       = NULL;
     real    *ptr;
     rvec    *x_d, *f_d;
-    real    *charge = NULL, *q_d;
-    real    energy_AB[2];
-    matrix  vir_AB[2];
+    real    *coefficient = NULL;
+    real    energy_AB[4];
+    matrix  vir_AB[4];
+    real    scale, lambda;
     gmx_bool bClearF;
     gmx_parallel_3dfft_t pfft_setup;
     real *  fftgrid;
     t_complex * cfftgrid;
     int     thread;
+    gmx_bool bFirst, bDoSplines;
+    int fep_state;
+    int fep_states_lj           = pme->bFEP_lj ? 2 : 1;
     const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
     const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
 
@@ -4307,37 +4758,81 @@ int gmx_pme_do(gmx_pme_t pme,
             atc->pd_nalloc = over_alloc_dd(atc->npd);
             srenew(atc->pd, atc->pd_nalloc);
         }
-        atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+        for (d = pme->ndecompdim-1; d >= 0; d--)
+        {
+            atc           = &pme->atc[d];
+            atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+        }
     }
     else
     {
+        atc = &pme->atc[0];
         /* This could be necessary for TPI */
         pme->atc[0].n = homenr;
+        if (DOMAINDECOMP(cr))
+        {
+            pme_realloc_atomcomm_things(atc);
+        }
+        atc->x = x;
+        atc->f = f;
     }
 
-    for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
+    m_inv_ur0(box, pme->recipbox);
+    bFirst = TRUE;
+
+    /* For simplicity, we construct the splines for all particles if
+     * more than one PME calculation is needed. Some optimization
+     * could be done by keeping track of which atoms have splines
+     * constructed, and constructing new splines on each pass only
+     * for atoms that don't yet have them.
+     */
+
+    bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
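+    /* When TRUE, make_bsplines skips its zero-coefficient check, so
+     * atoms with a zero coefficient in one pass still get splines for
+     * the passes in which their coefficient is non-zero. */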
+
+    /* We need a maximum of four separate PME calculations:
+     * grid_index=0: Coulomb PME with charges from state A
+     * grid_index=1: Coulomb PME with charges from state B
+     * grid_index=2: LJ PME with C6 from state A
+     * grid_index=3: LJ PME with C6 from state B
+     * For Lorentz-Berthelot combination rules, a separate loop is used to
+     * calculate all the terms
+     */
+
+    /* If we are doing LJ-PME with LB, we only do Q here */
+    max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
+
+    for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
     {
-        if (q == 0)
+        /* Check if we should do calculations at this grid_index.
+         * If grid_index is odd we should be doing FEP;
+         * if grid_index < 2 we should be doing electrostatic PME;
+         * if grid_index >= 2 we should be doing LJ-PME.
+         */
+        if ((grid_index <  DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
+                                    (grid_index == 1 && !pme->bFEP_q))) ||
+            (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
+                                    (grid_index == 3 && !pme->bFEP_lj))))
         {
-            pmegrid    = &pme->pmegridA;
-            fftgrid    = pme->fftgridA;
-            cfftgrid   = pme->cfftgridA;
-            pfft_setup = pme->pfft_setupA;
-            charge     = chargeA+start;
+            continue;
         }
-        else
+        /* Unpack structure */
+        pmegrid    = &pme->pmegrid[grid_index];
+        fftgrid    = pme->fftgrid[grid_index];
+        cfftgrid   = pme->cfftgrid[grid_index];
+        pfft_setup = pme->pfft_setup[grid_index];
+        switch (grid_index)
         {
-            pmegrid    = &pme->pmegridB;
-            fftgrid    = pme->fftgridB;
-            cfftgrid   = pme->cfftgridB;
-            pfft_setup = pme->pfft_setupB;
-            charge     = chargeB+start;
+            case 0: coefficient = chargeA + start; break;
+            case 1: coefficient = chargeB + start; break;
+            case 2: coefficient = c6A + start; break;
+            case 3: coefficient = c6B + start; break;
         }
+
         grid = pmegrid->grid.grid;
-        /* Unpack structure */
+
         if (debug)
         {
-            fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+            fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
                     cr->nnodes, cr->nodeid);
             fprintf(debug, "Grid = %p\n", (void*)grid);
             if (grid == NULL)
@@ -4347,58 +4842,14 @@ int gmx_pme_do(gmx_pme_t pme,
         }
         where();
 
-        m_inv_ur0(box, pme->recipbox);
-
         if (pme->nnodes == 1)
         {
-            atc = &pme->atc[0];
-            if (DOMAINDECOMP(cr))
-            {
-                atc->n = homenr;
-                pme_realloc_atomcomm_things(atc);
-            }
-            atc->x = x;
-            atc->q = charge;
-            atc->f = f;
+            atc->coefficient = coefficient;
         }
         else
         {
             wallcycle_start(wcycle, ewcPME_REDISTXF);
-            for (d = pme->ndecompdim-1; d >= 0; d--)
-            {
-                if (d == pme->ndecompdim-1)
-                {
-                    n_d = homenr;
-                    x_d = x + start;
-                    q_d = charge;
-                }
-                else
-                {
-                    n_d = pme->atc[d+1].n;
-                    x_d = atc->x;
-                    q_d = atc->q;
-                }
-                atc      = &pme->atc[d];
-                atc->npd = n_d;
-                if (atc->npd > atc->pd_nalloc)
-                {
-                    atc->pd_nalloc = over_alloc_dd(atc->npd);
-                    srenew(atc->pd, atc->pd_nalloc);
-                }
-                atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
-                pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
-                where();
-
-                /* Redistribute x (only once) and qA or qB */
-                if (DOMAINDECOMP(cr))
-                {
-                    dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
-                }
-                else
-                {
-                    pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
-                }
-            }
+            do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
             where();
 
             wallcycle_stop(wcycle, ewcPME_REDISTXF);
@@ -4406,22 +4857,22 @@ int gmx_pme_do(gmx_pme_t pme,
 
         if (debug)
         {
-            fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+            fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
                     cr->nodeid, atc->n);
         }
 
-        if (flags & GMX_PME_SPREAD_Q)
+        if (flags & GMX_PME_SPREAD)
         {
             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
 
-            /* Spread the charges on a grid */
-            spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
+            /* Spread the coefficients on a grid */
+            spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
 
-            if (q == 0)
+            if (bFirst)
             {
                 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
             }
-            inc_nrnb(nrnb, eNR_SPREADQBSP,
+            inc_nrnb(nrnb, eNR_SPREADBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 
             if (!pme->bUseThreads)
@@ -4432,12 +4883,12 @@ int gmx_pme_do(gmx_pme_t pme,
 #ifdef GMX_MPI
                 if (pme->nnodes > 1)
                 {
-                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
+                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
                     where();
                 }
 #endif
 
-                copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
+                copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
             }
 
             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
@@ -4472,16 +4923,28 @@ int gmx_pme_do(gmx_pme_t pme,
                 /* solve in k-space for our local cells */
                 if (thread == 0)
                 {
-                    wallcycle_start(wcycle, ewcPME_SOLVE);
+                    wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+                }
+                if (grid_index < DO_Q)
+                {
+                    loop_count =
+                        solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
+                                      box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+                                      bCalcEnerVir,
+                                      pme->nthread, thread);
                 }
-                loop_count =
-                    solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
-                                  box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
-                                  bCalcEnerVir,
-                                  pme->nthread, thread);
+                else
+                {
+                    loop_count =
+                        solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
+                                         box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+                                         bCalcEnerVir,
+                                         pme->nthread, thread);
+                }
+
                 if (thread == 0)
                 {
-                    wallcycle_stop(wcycle, ewcPME_SOLVE);
+                    wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
                     where();
                     inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                 }
@@ -4510,10 +4973,13 @@ int gmx_pme_do(gmx_pme_t pme,
                         inc_nrnb(nrnb, eNR_FFT, 2*npme);
                     }
 
+                    /* Note: this wallcycle region is closed below
+                       outside an OpenMP region, so take care if
+                       refactoring code here. */
                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
                 }
 
-                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
+                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
             }
         }
         /* End of thread parallel section.
@@ -4526,7 +4992,7 @@ int gmx_pme_do(gmx_pme_t pme,
 #ifdef GMX_MPI
             if (pme->nnodes > 1)
             {
-                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
+                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
             }
 #endif
             where();
@@ -4541,19 +5007,22 @@ int gmx_pme_do(gmx_pme_t pme,
              * atc->f is the actual force array, not a buffer,
              * therefore we should not clear it.
              */
-            bClearF = (q == 0 && PAR(cr));
+            lambda  = grid_index < DO_Q ? lambda_q : lambda_lj;
+            bClearF = (bFirst && PAR(cr));
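+            /* With FEP the even grid indices hold A-state coefficients
+             * and are weighted by 1-lambda, the odd indices hold the
+             * B state and are weighted by lambda. */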
 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
             for (thread = 0; thread < pme->nthread; thread++)
             {
                 gather_f_bsplines(pme, grid, bClearF, atc,
                                   &atc->spline[thread],
-                                  pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
+                                  pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
             }
 
             where();
 
             inc_nrnb(nrnb, eNR_GATHERFBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+            /* Note: this wallcycle region is opened above inside an OpenMP
+               region, so take care if refactoring code here. */
             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
         }
 
@@ -4562,9 +5031,269 @@ int gmx_pme_do(gmx_pme_t pme,
             /* This should only be called on the master thread
              * and after the threads have synchronized.
              */
-            get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+            if (grid_index < 2)
+            {
+                get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
+            }
+            else
+            {
+                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
+            }
         }
-    } /* of q-loop */
+        bFirst = FALSE;
+    } /* of grid_index-loop */
+
+    /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
+     * seven terms. */
+
+    if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
+    {
+        /* Loop over A- and B-state if we are doing FEP */
+        for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
+        {
+            real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
+            if (pme->nnodes == 1)
+            {
+                if (pme->lb_buf1 == NULL)
+                {
+                    pme->lb_buf_nalloc = pme->atc[0].n;
+                    snew(pme->lb_buf1, pme->lb_buf_nalloc);
+                }
+                pme->atc[0].coefficient = pme->lb_buf1;
+                switch (fep_state)
+                {
+                    case 0:
+                        local_c6      = c6A;
+                        local_sigma   = sigmaA;
+                        break;
+                    case 1:
+                        local_c6      = c6B;
+                        local_sigma   = sigmaB;
+                        break;
+                    default:
+                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+                }
+            }
+            else
+            {
+                atc = &pme->atc[0];
+                switch (fep_state)
+                {
+                    case 0:
+                        RedistC6      = c6A;
+                        RedistSigma   = sigmaA;
+                        break;
+                    case 1:
+                        RedistC6      = c6B;
+                        RedistSigma   = sigmaB;
+                        break;
+                    default:
+                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+                }
+                wallcycle_start(wcycle, ewcPME_REDISTXF);
+
+                do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
+                if (pme->lb_buf_nalloc < atc->n)
+                {
+                    pme->lb_buf_nalloc = atc->nalloc;
+                    srenew(pme->lb_buf1, pme->lb_buf_nalloc);
+                    srenew(pme->lb_buf2, pme->lb_buf_nalloc);
+                }
+                local_c6 = pme->lb_buf1;
+                for (i = 0; i < atc->n; ++i)
+                {
+                    local_c6[i] = atc->coefficient[i];
+                }
+                where();
+
+                do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
+                local_sigma = pme->lb_buf2;
+                for (i = 0; i < atc->n; ++i)
+                {
+                    local_sigma[i] = atc->coefficient[i];
+                }
+                where();
+
+                wallcycle_stop(wcycle, ewcPME_REDISTXF);
+            }
+            calc_initial_lb_coeffs(pme, local_c6, local_sigma);
+
+            /* Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics */
+            for (grid_index = 2; grid_index < 9; ++grid_index)
+            {
+                /* Unpack structure */
+                pmegrid    = &pme->pmegrid[grid_index];
+                fftgrid    = pme->fftgrid[grid_index];
+                cfftgrid   = pme->cfftgrid[grid_index];
+                pfft_setup = pme->pfft_setup[grid_index];
+                calc_next_lb_coeffs(pme, local_sigma);
+                grid = pmegrid->grid.grid;
+                where();
+
+                if (flags & GMX_PME_SPREAD)
+                {
+                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+                    /* Spread the c6 on a grid */
+                    spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
+
+                    if (bFirst)
+                    {
+                        inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
+                    }
+
+                    inc_nrnb(nrnb, eNR_SPREADBSP,
+                             pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
+                    if (pme->nthread == 1)
+                    {
+                        wrap_periodic_pmegrid(pme, grid);
+                        /* sum contributions to local grid from other nodes */
+#ifdef GMX_MPI
+                        if (pme->nnodes > 1)
+                        {
+                            gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
+                            where();
+                        }
+#endif
+                        copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
+                    }
+                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+                }
+                /* Here we start a large thread parallel region */
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+                {
+                    thread = gmx_omp_get_thread_num();
+                    if (flags & GMX_PME_SOLVE)
+                    {
+                        /* do 3d-fft */
+                        if (thread == 0)
+                        {
+                            wallcycle_start(wcycle, ewcPME_FFT);
+                        }
+
+                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+                                                   thread, wcycle);
+                        if (thread == 0)
+                        {
+                            wallcycle_stop(wcycle, ewcPME_FFT);
+                        }
+                        where();
+                    }
+                }
+                bFirst = FALSE;
+            }
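+            /* With LB the seven grids couple in reciprocal space, so the
+             * solve is done once over cfftgrid[2..8] after all forward
+             * transforms; the TRUE argument to solve_pme_lj_yzx below is
+             * presumably the LB flag.
+             */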
+            if (flags & GMX_PME_SOLVE)
+            {
+                /* solve in k-space for our local cells */
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+                {
+                    int loop_count;
+                    thread = gmx_omp_get_thread_num();
+                    if (thread == 0)
+                    {
+                        wallcycle_start(wcycle, ewcLJPME);
+                    }
+
+                    loop_count =
+                        solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
+                                         box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+                                         bCalcEnerVir,
+                                         pme->nthread, thread);
+                    if (thread == 0)
+                    {
+                        wallcycle_stop(wcycle, ewcLJPME);
+                        where();
+                        inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+                    }
+                }
+            }
+
+            if (bCalcEnerVir)
+            {
+                /* This should only be called on the master thread and
+                 * after the threads have synchronized.
+                 */
+                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
+            }
+
+            if (bCalcF)
+            {
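+                /* If Coulomb PME did not run, the first LJ gather below must
+                 * clear the force buffer rather than accumulate into it.
+                 */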
+                bFirst = !(flags & GMX_PME_DO_COULOMB);
+                calc_initial_lb_coeffs(pme, local_c6, local_sigma);
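+                /* The grids are gathered in reverse order: the coefficients
+                 * again advance by one power of sigma per pass, which
+                 * presumably pairs grid k with the complementary power
+                 * sigma^(6-k) of its binomial term.
+                 */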
+                for (grid_index = 8; grid_index >= 2; --grid_index)
+                {
+                    /* Unpack structure */
+                    pmegrid    = &pme->pmegrid[grid_index];
+                    fftgrid    = pme->fftgrid[grid_index];
+                    cfftgrid   = pme->cfftgrid[grid_index];
+                    pfft_setup = pme->pfft_setup[grid_index];
+                    grid       = pmegrid->grid.grid;
+                    calc_next_lb_coeffs(pme, local_sigma);
+                    where();
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+                    {
+                        thread = gmx_omp_get_thread_num();
+                        /* do 3d-invfft */
+                        if (thread == 0)
+                        {
+                            where();
+                            wallcycle_start(wcycle, ewcPME_FFT);
+                        }
+
+                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+                                                   thread, wcycle);
+                        if (thread == 0)
+                        {
+                            wallcycle_stop(wcycle, ewcPME_FFT);
+
+                            where();
+
+                            if (pme->nodeid == 0)
+                            {
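+                                /* Accumulate an ntot*log2(ntot) flop estimate
+                                 * for the FFTs; the factor 2 presumably counts
+                                 * the forward/backward pair.
+                                 */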
+                                ntot  = pme->nkx*pme->nky*pme->nkz;
+                                npme  = ntot*log((real)ntot)/log(2.0);
+                                inc_nrnb(nrnb, eNR_FFT, 2*npme);
+                            }
+                            wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+                        }
+
+                        copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
+
+                    } /*#pragma omp parallel*/
+
+                    /* distribute local grid to all nodes */
+#ifdef GMX_MPI
+                    if (pme->nnodes > 1)
+                    {
+                        gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
+                    }
+#endif
+                    where();
+
+                    unwrap_periodic_pmegrid(pme, grid);
+
+                    /* interpolate forces for our local atoms */
+                    where();
+                    bClearF = (bFirst && PAR(cr));
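+                    /* Weight this term: the FEP lambda factor for the current
+                     * state times this grid's binomial weight (lb_scale_factor
+                     * presumably holds C(6,k)/2^6 from the LB expansion).
+                     */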
+                    scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
+                    scale  *= lb_scale_factor[grid_index-2];
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+                    for (thread = 0; thread < pme->nthread; thread++)
+                    {
+                        gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
+                                          &pme->atc[0].spline[thread],
+                                          scale);
+                    }
+                    where();
+
+                    inc_nrnb(nrnb, eNR_GATHERFBSP,
+                             pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+
+                    bFirst = FALSE;
+                } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
+            }     /* if (bCalcF) */
+        }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
+    }             /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
 
     if (bCalcF && pme->nnodes > 1)
     {
@@ -4587,10 +5316,6 @@ int gmx_pme_do(gmx_pme_t pme,
                 dd_pmeredist_f(pme, atc, n_d, f_d,
                                d == pme->ndecompdim-1 && pme->bPPnode);
             }
-            else
-            {
-                pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
-            }
         }
 
         wallcycle_stop(wcycle, ewcPME_REDISTXF);
@@ -4599,34 +5324,64 @@ int gmx_pme_do(gmx_pme_t pme,
 
     if (bCalcEnerVir)
     {
-        if (!pme->bFEP)
+        if (flags & GMX_PME_DO_COULOMB)
         {
-            *energy = energy_AB[0];
-            m_add(vir, vir_AB[0], vir);
+            if (!pme->bFEP_q)
+            {
+                *energy_q = energy_AB[0];
+                m_add(vir_q, vir_AB[0], vir_q);
+            }
+            else
+            {
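+                /* Linear FEP interpolation: E(lambda) = (1-lambda)*E_A + lambda*E_B,
+                 * hence dE/dlambda = E_B - E_A; the virial is mixed the same way.
+                 */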
+                *energy_q       = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
+                *dvdlambda_q   += energy_AB[1] - energy_AB[0];
+                for (i = 0; i < DIM; i++)
+                {
+                    for (j = 0; j < DIM; j++)
+                    {
+                        vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
+                            lambda_q*vir_AB[1][i][j];
+                    }
+                }
+            }
+            if (debug)
+            {
+                fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
+            }
         }
         else
         {
-            *energy     = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
-            *dvdlambda += energy_AB[1] - energy_AB[0];
-            for (i = 0; i < DIM; i++)
+            *energy_q = 0;
+        }
+
+        if (flags & GMX_PME_DO_LJ)
+        {
+            if (!pme->bFEP_lj)
+            {
+                *energy_lj = energy_AB[2];
+                m_add(vir_lj, vir_AB[2], vir_lj);
+            }
+            else
             {
-                for (j = 0; j < DIM; j++)
+                *energy_lj     = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
+                *dvdlambda_lj += energy_AB[3] - energy_AB[2];
+                for (i = 0; i < DIM; i++)
                 {
-                    vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
-                        lambda*vir_AB[1][i][j];
+                    for (j = 0; j < DIM; j++)
+                    {
+                        vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
+                    }
                 }
             }
+            if (debug)
+            {
+                fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
+            }
+        }
+        else
+        {
+            *energy_lj = 0;
         }
     }
-    else
-    {
-        *energy = 0;
-    }
-
-    if (debug)
-    {
-        fprintf(debug, "PME mesh energy: %g\n", *energy);
-    }
-
     return 0;
 }