Sort all includes in src/gromacs
[alexxy/gromacs.git] src/gromacs/mdlib/pme.c
index 6927eace8018b719b2da49435e6770c491654aa6..67a12ab58db8d0a8f5b4dc4a2f53f75214da1568 100644
@@ -1,37 +1,38 @@
-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
  *
- *
- *                This source code is part of
- *
- *                 G   R   O   M   A   C   S
- *
- *          GROningen MAchine for Chemical Simulations
- *
- *                        VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
  * of the License, or (at your option) any later version.
  *
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  *
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
  *
- * And Hey:
- * GROwing Monsters And Cloning Shrimps
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
  */
 /* IMPORTANT FOR DEVELOPERS:
  *
  * /Erik 001109
  */
 
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
 
-#ifdef GMX_LIB_MPI
-#include <mpi.h>
-#endif
-#ifdef GMX_THREAD_MPI
-#include "tmpi.h"
-#endif
+#include "gromacs/legacyheaders/pme.h"
+
+#include "config.h"
 
+#include <assert.h>
+#include <math.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-#include "txtdump.h"
-#include "vec.h"
-#include "gmxcomplex.h"
-#include "smalloc.h"
-#include "futil.h"
-#include "coulomb.h"
-#include "gmx_fatal.h"
-#include "pme.h"
-#include "network.h"
-#include "physics.h"
-#include "nrnb.h"
-#include "copyrite.h"
-#include "gmx_wallcycle.h"
-#include "gmx_parallel_3dfft.h"
-#include "pdbio.h"
-#include "gmx_cyclecounter.h"
-#include "gmx_omp.h"
-#include "macros.h"
-
-/* Single precision, with SSE2 or higher available */
-#if defined(GMX_X86_SSE2) && !defined(GMX_DOUBLE)
-
-#include "gmx_x86_simd_single.h"
-
-#define PME_SSE
-/* Some old AMD processors could have problems with unaligned loads+stores */
-#ifndef GMX_FAHCORE
-#define PME_SSE_UNALIGNED
+
+#include "gromacs/fft/parallel_3dfft.h"
+#include "gromacs/fileio/pdbio.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/math/gmxcomplex.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/timing/cyclecounter.h"
+#include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/futil.h"
+#include "gromacs/utility/gmxmpi.h"
+#include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/smalloc.h"
+
+/* Include the SIMD macro file and then check for support */
+#include "gromacs/simd/simd.h"
+#include "gromacs/simd/simd_math.h"
+#ifdef GMX_SIMD_HAVE_REAL
+/* Turn on arbitrary width SIMD intrinsics for PME solve */
+#    define PME_SIMD_SOLVE
 #endif
+
+#define PME_GRID_QA    0 /* Gridindex for A-state for Q */
+#define PME_GRID_C6A   2 /* Gridindex for A-state for LJ */
+#define DO_Q           2 /* Electrostatic grids have index q<2 */
+#define DO_Q_AND_LJ    4 /* non-LB LJ grids have index 2 <= q < 4 */
+#define DO_Q_AND_LJ_LB 9 /* With LB rules we need a total of 2+7 grids */
+
+/* Pascal triangle coefficients scaled with (1/2)^6 for LJ-PME with LB-rules */
+const real lb_scale_factor[] = {
+    1.0/64, 6.0/64, 15.0/64, 20.0/64,
+    15.0/64, 6.0/64, 1.0/64
+};
+
+/* Pascal triangle coefficients used in solve_pme_lj_yzx, only need to do 4 calculations due to symmetry */
+const real lb_scale_factor_symm[] = { 2.0/64, 12.0/64, 30.0/64, 20.0/64 };
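These tables are the binomial coefficients of (sigma_i + sigma_j)^6 scaled by 1/2^6. Under Lorentz-Berthelot rules the pair dispersion coefficient factorizes into one per-particle grid per power of sigma, which is where the seven weights come from; a worked form of the expansion, for reference, consistent with the arrays above:

    C_{6,ij} \propto \sqrt{\epsilon_i \epsilon_j}\left(\frac{\sigma_i+\sigma_j}{2}\right)^{6}
             = \frac{1}{64}\sum_{k=0}^{6}\binom{6}{k}\,\sqrt{\epsilon_i}\,\sigma_i^{k}\cdot\sqrt{\epsilon_j}\,\sigma_j^{6-k}

so the weights are {1, 6, 15, 20, 15, 6, 1}/64. The symmetric table folds the k and 6-k cross terms together (2*binom(6,k)/64 for k = 0..2, plus the middle 20/64), which is why the LJ solve below pairs grid[ig] with grid[6-ig].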
+
+/* Check if we have 4-wide SIMD macro support */
+#if (defined GMX_SIMD4_HAVE_REAL)
+/* Do PME spread and gather with 4-wide SIMD.
+ * NOTE: SIMD is only used with PME order 4 and 5 (which are the most common).
+ */
+#    define PME_SIMD4_SPREAD_GATHER
+
+#    if (defined GMX_SIMD_HAVE_LOADU) && (defined GMX_SIMD_HAVE_STOREU)
+/* With PME-order=4 on x86, unaligned load+store is slightly faster
+ * than doubling all SIMD operations when using aligned load+store.
+ */
+#        define PME_SIMD4_UNALIGNED
+#    endif
 #endif
 
 #define DFT_TOL 1e-7

 #ifdef GMX_DOUBLE
 #define mpi_type MPI_DOUBLE
 #else
 #define mpi_type MPI_FLOAT
 #endif
 
-/* GMX_CACHE_SEP should be a multiple of 16 to preserve alignment */
+#ifdef PME_SIMD4_SPREAD_GATHER
+#    define SIMD4_ALIGNMENT  (GMX_SIMD4_WIDTH*sizeof(real))
+#else
+/* We can use any alignment, apart from 0, so we use 4 reals */
+#    define SIMD4_ALIGNMENT  (4*sizeof(real))
+#endif
+
+/* GMX_CACHE_SEP should be a multiple of the SIMD and SIMD4 register size
+ * to preserve alignment.
+ */
 #define GMX_CACHE_SEP 64
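SIMD4_ALIGNMENT feeds the aligned allocator used throughout this file. A minimal sketch of the idiom, using the snew_aligned/sfree_aligned macros that already appear below; the helper name and size are hypothetical, for illustration only:

    /* Illustration only; not code from this change. Pad the row by one
     * SIMD4 width so aligned 4-wide loads starting at any multiple of
     * GMX_SIMD4_WIDTH never run past the buffer. */
    static real *alloc_padded_row(int n)
    {
        real *row;

        snew_aligned(row, n + GMX_SIMD4_WIDTH, SIMD4_ALIGNMENT);

        return row; /* release with sfree_aligned(row) */
    }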
 
 /* We only define a maximum to be able to use local arrays without allocation.
@@ -190,16 +223,16 @@ typedef struct {
     int      n;
     int      nalloc;
     rvec    *x;
-    real    *q;
+    real    *coefficient;
     rvec    *f;
     gmx_bool bSpread;       /* These coordinates are used for spreading */
     int      pme_order;
     ivec    *idx;
-    rvec    *fractx;            /* Fractional coordinate relative to the
-                                 * lower cell boundary
+    rvec    *fractx;            /* Fractional coordinate relative to
+                                 * the lower cell boundary
                                  */
     int             nthread;
-    int            *thread_idx; /* Which thread should spread which charge */
+    int            *thread_idx; /* Which thread should spread which coefficient */
     thread_plist_t *thread_plist;
     splinedata_t   *spline;
 } pme_atomcomm_t;
@@ -226,13 +259,12 @@ typedef struct {
     ivec       nthread_comm; /* The number of threads to communicate with        */
 } pmegrids_t;
 
-
 typedef struct {
-#ifdef PME_SSE
-    /* Masks for SSE aligned spreading and gathering */
-    __m128 mask_SSE0[6], mask_SSE1[6];
+#ifdef PME_SIMD4_SPREAD_GATHER
+    /* Masks for 4-wide SIMD aligned spreading and gathering */
+    gmx_simd4_bool_t mask_S0[6], mask_S1[6];
 #else
-    int    dummy; /* C89 requires that struct has at least one member */
+    int              dummy; /* C89 requires that struct has at least one member */
 #endif
 } pme_spline_work_t;
 
@@ -246,11 +278,14 @@ typedef struct {
     real *   denom;
     real *   tmp1_alloc;
     real *   tmp1;
+    real *   tmp2;
     real *   eterm;
     real *   m2inv;
 
-    real     energy;
-    matrix   vir;
+    real     energy_q;
+    matrix   vir_q;
+    real     energy_lj;
+    matrix   vir_lj;
 } pme_work_t;
 
 typedef struct gmx_pme {
@@ -268,18 +303,32 @@ typedef struct gmx_pme {
     MPI_Datatype  rvec_mpi;      /* the pme vector's MPI type */
 #endif
 
-    int        nthread;       /* The number of threads doing PME */
+    gmx_bool   bUseThreads;   /* Does any of the PME ranks have nthread>1 ?  */
+    int        nthread;       /* The number of threads doing PME on our rank */
 
     gmx_bool   bPPnode;       /* Node also does particle-particle forces */
     gmx_bool   bFEP;          /* Compute Free energy contribution */
+    gmx_bool   bFEP_q;
+    gmx_bool   bFEP_lj;
     int        nkx, nky, nkz; /* Grid dimensions */
     gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
     int        pme_order;
     real       epsilon_r;
 
-    pmegrids_t pmegridA;  /* Grids on which we do spreading/interpolation, includes overlap */
-    pmegrids_t pmegridB;
-    /* The PME charge spreading grid sizes/strides, includes pme_order-1 */
+    int        ljpme_combination_rule;  /* Type of combination rule in LJ-PME */
+
+    int        ngrids;                  /* number of grids we maintain for pmegrid, (c)fftgrid and pfft_setups */
+
+    pmegrids_t pmegrid[DO_Q_AND_LJ_LB]; /* Grids on which we do spreading/interpolation,
+                                         * includes overlap. Grid indices are ordered as
+                                         * follows:
+                                         * 0: Coulomb PME, state A
+                                         * 1: Coulomb PME, state B
+                                         * 2-8: LJ-PME
+                                         * This can probably be done in a better way
+                                         * but this simple hack works for now
+                                         * (an index helper is sketched after this struct)
+                                         */
+    /* The PME coefficient spreading grid sizes/strides, includes pme_order-1 */
     int        pmegrid_nx, pmegrid_ny, pmegrid_nz;
     /* pmegrid_nz might be larger than strictly necessary to ensure
      * memory alignment, pmegrid_nz_base gives the real base size.
@@ -289,56 +338,50 @@ typedef struct gmx_pme {
     int     pmegrid_start_ix, pmegrid_start_iy, pmegrid_start_iz;
 
     /* Work data for spreading and gathering */
-    pme_spline_work_t    *spline_work;
+    pme_spline_work_t     *spline_work;
 
-    real                 *fftgridA; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
-    real                 *fftgridB; /* inside the interpolation grid, but separate for 2D PME decomp. */
-    int                   fftgrid_nx, fftgrid_ny, fftgrid_nz;
+    real                 **fftgrid; /* Grids for FFT. With 1D FFT decomposition this can be a pointer */
+    /* inside the interpolation grid, but separate for 2D PME decomp. */
+    int                    fftgrid_nx, fftgrid_ny, fftgrid_nz;
 
-    t_complex            *cfftgridA;  /* Grids for complex FFT data */
-    t_complex            *cfftgridB;
-    int                   cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
+    t_complex            **cfftgrid;  /* Grids for complex FFT data */
 
-    gmx_parallel_3dfft_t  pfft_setupA;
-    gmx_parallel_3dfft_t  pfft_setupB;
+    int                    cfftgrid_nx, cfftgrid_ny, cfftgrid_nz;
 
-    int                  *nnx, *nny, *nnz;
-    real                 *fshx, *fshy, *fshz;
+    gmx_parallel_3dfft_t  *pfft_setup;
 
-    pme_atomcomm_t        atc[2]; /* Indexed on decomposition index */
-    matrix                recipbox;
-    splinevec             bsp_mod;
+    int                   *nnx, *nny, *nnz;
+    real                  *fshx, *fshy, *fshz;
 
-    pme_overlap_t         overlap[2]; /* Indexed on dimension, 0=x, 1=y */
+    pme_atomcomm_t         atc[2]; /* Indexed on decomposition index */
+    matrix                 recipbox;
+    splinevec              bsp_mod;
+    /* Buffers to store data for local atoms for L-B combination rule
+     * calculations in LJ-PME. lb_buf1 stores either the coefficients
+     * for spreading/gathering (in serial), or the C6 coefficients for
+     * local atoms (in parallel).  lb_buf2 is only used in parallel,
+     * and stores the sigma values for local atoms. */
+    real                 *lb_buf1, *lb_buf2;
+    int                   lb_buf_nalloc; /* Allocation size for the above buffers. */
 
-    pme_atomcomm_t        atc_energy; /* Only for gmx_pme_calc_energy */
+    pme_overlap_t         overlap[2];    /* Indexed on dimension, 0=x, 1=y */
 
-    rvec                 *bufv;       /* Communication buffer */
-    real                 *bufr;       /* Communication buffer */
-    int                   buf_nalloc; /* The communication buffer size */
+    pme_atomcomm_t        atc_energy;    /* Only for gmx_pme_calc_energy */
+
+    rvec                 *bufv;          /* Communication buffer */
+    real                 *bufr;          /* Communication buffer */
+    int                   buf_nalloc;    /* The communication buffer size */
 
     /* thread local work data for solve_pme */
     pme_work_t *work;
 
-    /* Work data for PME_redist */
-    gmx_bool redist_init;
-    int *    scounts;
-    int *    rcounts;
-    int *    sdispls;
-    int *    rdispls;
-    int *    sidx;
-    int *    idxa;
-    real *   redist_buf;
-    int      redist_buf_nalloc;
-
     /* Work data for sum_qgrid */
     real *   sum_qgrid_tmp;
     real *   sum_qgrid_dd_tmp;
 } t_gmx_pme;
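The grid-index convention described in the pmegrid comment above follows from the PME_GRID_*/DO_* macros near the top of the file. A hypothetical helper, sketched only to illustrate that layout (not part of the change):

    /* Illustration of the grid-index layout: 0..1 are Coulomb (A/B state),
     * 2..8 are LJ-PME grids; only 2..3 exist without LB combination rules. */
    static gmx_bool grid_is_electrostatic(int grid_index)
    {
        return grid_index < DO_Q;
    }

    static gmx_bool grid_is_lj(int grid_index, gmx_bool bLB)
    {
        return grid_index >= DO_Q &&
               grid_index < (bLB ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
    }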
 
-
 static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
-                                   int start, int end, int thread)
+                                   int start, int grid_index, int end, int thread)
 {
     int             i;
     int            *idxptr, tix, tiy, tiz;
@@ -368,9 +411,9 @@ static void calc_interpolation_idx(gmx_pme_t pme, pme_atomcomm_t *atc,
     rzy = pme->recipbox[ZZ][YY];
     rzz = pme->recipbox[ZZ][ZZ];
 
-    g2tx = pme->pmegridA.g2t[XX];
-    g2ty = pme->pmegridA.g2t[YY];
-    g2tz = pme->pmegridA.g2t[ZZ];
+    g2tx = pme->pmegrid[grid_index].g2t[XX];
+    g2ty = pme->pmegrid[grid_index].g2t[YY];
+    g2tz = pme->pmegrid[grid_index].g2t[ZZ];
 
     bThreads = (atc->nthread > 1);
     if (bThreads)
@@ -577,8 +620,9 @@ static void realloc_splinevec(splinevec th, real **ptr_z, int nalloc)
 
     srenew(th[XX], nalloc);
     srenew(th[YY], nalloc);
-    /* In z we add padding, this is only required for the aligned SSE code */
-    srenew(*ptr_z, nalloc+2*padding);
+    /* In z we add padding, this is only required for the aligned SIMD code */
+    sfree_aligned(*ptr_z);
+    snew_aligned(*ptr_z, nalloc+2*padding, SIMD4_ALIGNMENT);
     th[ZZ] = *ptr_z + padding;
 
     for (i = 0; i < padding; i++)
@@ -620,7 +664,7 @@ static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
         if (atc->nslab > 1)
         {
             srenew(atc->x, atc->nalloc);
-            srenew(atc->q, atc->nalloc);
+            srenew(atc->coefficient, atc->nalloc);
             srenew(atc->f, atc->nalloc);
             for (i = nalloc_old; i < atc->nalloc; i++)
             {
@@ -645,121 +689,10 @@ static void pme_realloc_atomcomm_things(pme_atomcomm_t *atc)
     }
 }
 
-static void pmeredist_pd(gmx_pme_t pme, gmx_bool forw,
-                         int n, gmx_bool bXF, rvec *x_f, real *charge,
-                         pme_atomcomm_t *atc)
-/* Redistribute particle data for PME calculation */
-/* domain decomposition by x coordinate           */
-{
-    int *idxa;
-    int  i, ii;
-
-    if (FALSE == pme->redist_init)
-    {
-        snew(pme->scounts, atc->nslab);
-        snew(pme->rcounts, atc->nslab);
-        snew(pme->sdispls, atc->nslab);
-        snew(pme->rdispls, atc->nslab);
-        snew(pme->sidx, atc->nslab);
-        pme->redist_init = TRUE;
-    }
-    if (n > pme->redist_buf_nalloc)
-    {
-        pme->redist_buf_nalloc = over_alloc_dd(n);
-        srenew(pme->redist_buf, pme->redist_buf_nalloc*DIM);
-    }
-
-    pme->idxa = atc->pd;
-
-#ifdef GMX_MPI
-    if (forw && bXF)
-    {
-        /* forward, redistribution from pp to pme */
-
-        /* Calculate send counts and exchange them with other nodes */
-        for (i = 0; (i < atc->nslab); i++)
-        {
-            pme->scounts[i] = 0;
-        }
-        for (i = 0; (i < n); i++)
-        {
-            pme->scounts[pme->idxa[i]]++;
-        }
-        MPI_Alltoall( pme->scounts, 1, MPI_INT, pme->rcounts, 1, MPI_INT, atc->mpi_comm);
-
-        /* Calculate send and receive displacements and index into send
-           buffer */
-        pme->sdispls[0] = 0;
-        pme->rdispls[0] = 0;
-        pme->sidx[0]    = 0;
-        for (i = 1; i < atc->nslab; i++)
-        {
-            pme->sdispls[i] = pme->sdispls[i-1]+pme->scounts[i-1];
-            pme->rdispls[i] = pme->rdispls[i-1]+pme->rcounts[i-1];
-            pme->sidx[i]    = pme->sdispls[i];
-        }
-        /* Total # of particles to be received */
-        atc->n = pme->rdispls[atc->nslab-1] + pme->rcounts[atc->nslab-1];
-
-        pme_realloc_atomcomm_things(atc);
-
-        /* Copy particle coordinates into send buffer and exchange*/
-        for (i = 0; (i < n); i++)
-        {
-            ii = DIM*pme->sidx[pme->idxa[i]];
-            pme->sidx[pme->idxa[i]]++;
-            pme->redist_buf[ii+XX] = x_f[i][XX];
-            pme->redist_buf[ii+YY] = x_f[i][YY];
-            pme->redist_buf[ii+ZZ] = x_f[i][ZZ];
-        }
-        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls,
-                      pme->rvec_mpi, atc->x, pme->rcounts, pme->rdispls,
-                      pme->rvec_mpi, atc->mpi_comm);
-    }
-    if (forw)
-    {
-        /* Copy charge into send buffer and exchange*/
-        for (i = 0; i < atc->nslab; i++)
-        {
-            pme->sidx[i] = pme->sdispls[i];
-        }
-        for (i = 0; (i < n); i++)
-        {
-            ii = pme->sidx[pme->idxa[i]];
-            pme->sidx[pme->idxa[i]]++;
-            pme->redist_buf[ii] = charge[i];
-        }
-        MPI_Alltoallv(pme->redist_buf, pme->scounts, pme->sdispls, mpi_type,
-                      atc->q, pme->rcounts, pme->rdispls, mpi_type,
-                      atc->mpi_comm);
-    }
-    else   /* backward, redistribution from pme to pp */
-    {
-        MPI_Alltoallv(atc->f, pme->rcounts, pme->rdispls, pme->rvec_mpi,
-                      pme->redist_buf, pme->scounts, pme->sdispls,
-                      pme->rvec_mpi, atc->mpi_comm);
-
-        /* Copy data from receive buffer */
-        for (i = 0; i < atc->nslab; i++)
-        {
-            pme->sidx[i] = pme->sdispls[i];
-        }
-        for (i = 0; (i < n); i++)
-        {
-            ii          = DIM*pme->sidx[pme->idxa[i]];
-            x_f[i][XX] += pme->redist_buf[ii+XX];
-            x_f[i][YY] += pme->redist_buf[ii+YY];
-            x_f[i][ZZ] += pme->redist_buf[ii+ZZ];
-            pme->sidx[pme->idxa[i]]++;
-        }
-    }
-#endif
-}
-
-static void pme_dd_sendrecv(pme_atomcomm_t *atc,
-                            gmx_bool bBackward, int shift,
-                            void *buf_s, int nbyte_s,
-                            void *buf_r, int nbyte_r)
+static void pme_dd_sendrecv(pme_atomcomm_t gmx_unused *atc,
+                            gmx_bool gmx_unused bBackward, int gmx_unused shift,
+                            void gmx_unused *buf_s, int gmx_unused nbyte_s,
+                            void gmx_unused *buf_r, int gmx_unused nbyte_r)
 {
 #ifdef GMX_MPI
     int        dest, src;
@@ -799,9 +732,9 @@ static void pme_dd_sendrecv(pme_atomcomm_t *atc,
 #endif
 }
 
-static void dd_pmeredist_x_q(gmx_pme_t pme,
-                             int n, gmx_bool bX, rvec *x, real *charge,
-                             pme_atomcomm_t *atc)
+static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
+                                    int n, gmx_bool bX, rvec *x, real *data,
+                                    pme_atomcomm_t *atc)
 {
     int *commnode, *buf_index;
     int  nnodes_comm, i, nsend, local_pos, buf_pos, node, scount, rcount;
@@ -821,7 +754,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
     {
         if (atc->count[atc->nodeid] + nsend != n)
         {
-            gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+            gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
                       "This usually means that your system is not well equilibrated.",
                       n - (atc->count[atc->nodeid] + nsend),
                       pme->nodeid, 'x'+atc->dimind);
@@ -841,7 +774,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
             /* Communicate the count */
             if (debug)
             {
-                fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+                fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
                         atc->dimind, atc->nodeid, commnode[i], scount);
             }
             pme_dd_sendrecv(atc, FALSE, i,
@@ -864,7 +797,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
             {
                 copy_rvec(x[i], atc->x[local_pos]);
             }
-            atc->q[local_pos] = charge[i];
+            atc->coefficient[local_pos] = data[i];
             local_pos++;
         }
         else
@@ -874,7 +807,7 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
             {
                 copy_rvec(x[i], pme->bufv[buf_index[node]]);
             }
-            pme->bufr[buf_index[node]] = charge[i];
+            pme->bufr[buf_index[node]] = data[i];
             buf_index[node]++;
         }
     }
@@ -893,10 +826,10 @@ static void dd_pmeredist_x_q(gmx_pme_t pme,
                                 pme->bufv[buf_pos], scount*sizeof(rvec),
                                 atc->x[local_pos], rcount*sizeof(rvec));
             }
-            /* Communicate the charges */
+            /* Communicate the coefficients */
             pme_dd_sendrecv(atc, FALSE, i,
                             pme->bufr+buf_pos, scount*sizeof(real),
-                            atc->q+local_pos, rcount*sizeof(real));
+                            atc->coefficient+local_pos, rcount*sizeof(real));
             buf_pos   += scount;
             local_pos += atc->rcount[i];
         }
@@ -975,8 +908,7 @@ static void dd_pmeredist_f(gmx_pme_t pme, pme_atomcomm_t *atc,
 }
 
 #ifdef GMX_MPI
-static void
-gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
+static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 {
     pme_overlap_t *overlap;
     int            send_index0, send_nindex;
@@ -995,7 +927,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         /* Since we have already (un)wrapped the overlap in the z-dimension,
          * we only have to communicate 0 to nkz (not pmegrid_nz).
          */
-        if (direction == GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_GRID_FORWARD)
         {
             send_id       = overlap->send_id[ipulse];
             recv_id       = overlap->recv_id[ipulse];
@@ -1017,7 +949,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         /* Copy data to contiguous send buffer */
         if (debug)
         {
-            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, send_id,
                     pme->pmegrid_start_iy,
                     send_index0-pme->pmegrid_start_iy,
@@ -1049,7 +981,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         /* Get data from contiguous recv buffer */
         if (debug)
         {
-            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, recv_id,
                     pme->pmegrid_start_iy,
                     recv_index0-pme->pmegrid_start_iy,
@@ -1065,7 +997,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
                 for (k = 0; k < pme->nkz; k++)
                 {
                     iz = k;
-                    if (direction == GMX_SUM_QGRID_FORWARD)
+                    if (direction == GMX_SUM_GRID_FORWARD)
                     {
                         grid[ix*(pme->pmegrid_ny*pme->pmegrid_nz)+iy*(pme->pmegrid_nz)+iz] += overlap->recvbuf[icnt++];
                     }
@@ -1087,7 +1019,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 
     for (ipulse = 0; ipulse < overlap->noverlap_nodes; ipulse++)
     {
-        if (direction == GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_GRID_FORWARD)
         {
             send_id       = overlap->send_id[ipulse];
             recv_id       = overlap->recv_id[ipulse];
@@ -1113,12 +1045,12 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 
         if (debug)
         {
-            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, send_id,
                     pme->pmegrid_start_ix,
                     send_index0-pme->pmegrid_start_ix,
                     send_index0-pme->pmegrid_start_ix+send_nindex);
-            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, recv_id,
                     pme->pmegrid_start_ix,
                     recv_index0-pme->pmegrid_start_ix,
@@ -1132,7 +1064,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
                      overlap->mpi_comm, &stat);
 
         /* ADD data from contiguous recv buffer */
-        if (direction == GMX_SUM_QGRID_FORWARD)
+        if (direction == GMX_SUM_GRID_FORWARD)
         {
             p = grid + (recv_index0-pme->pmegrid_start_ix)*(pme->pmegrid_ny*pme->pmegrid_nz);
             for (i = 0; i < recv_nindex*datasize; i++)
@@ -1145,8 +1077,7 @@ gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 #endif
 
 
-static int
-copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
+static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid, int grid_index)
 {
     ivec    local_fft_ndata, local_fft_offset, local_fft_size;
     ivec    local_pme_size;
@@ -1154,7 +1085,7 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
     int     pmeidx, fftidx;
 
     /* Dimensions should be identical for A/B grid, so we just use A here */
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -1169,13 +1100,12 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
     {
 #ifdef DEBUG_PME
         FILE *fp, *fp2;
-        char  fn[STRLEN], format[STRLEN];
+        char  fn[STRLEN];
         real  val;
         sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
-        fp = ffopen(fn, "w");
+        fp = gmx_ffopen(fn, "w");
         sprintf(fn, "pmegrid%d.txt", pme->nodeid);
-        fp2 = ffopen(fn, "w");
-        sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
+        fp2 = gmx_ffopen(fn, "w");
 #endif
 
         for (ix = 0; ix < local_fft_ndata[XX]; ix++)
@@ -1191,8 +1121,8 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
                     val = 100*pmegrid[pmeidx];
                     if (pmegrid[pmeidx] != 0)
                     {
-                        fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
-                                5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+                        gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
+                                                 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
                     }
                     if (pmegrid[pmeidx] != 0)
                     {
@@ -1208,8 +1138,8 @@ copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid)
             }
         }
 #ifdef DEBUG_PME
-        ffclose(fp);
-        ffclose(fp2);
+        gmx_ffclose(fp);
+        gmx_ffclose(fp2);
 #endif
     }
     return 0;
@@ -1227,9 +1157,8 @@ static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
 }
 
 
-static int
-copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
-                        int nthread, int thread)
+static int copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid, int grid_index,
+                                   int nthread, int thread)
 {
     ivec          local_fft_ndata, local_fft_offset, local_fft_size;
     ivec          local_pme_size;
@@ -1245,7 +1174,7 @@ copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
     c1 = omp_cyc_start();
 #endif
     /* Dimensions should be identical for A/B grid, so we just use A here */
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -1287,8 +1216,7 @@ copy_fftgrid_to_pmegrid(gmx_pme_t pme, const real *fftgrid, real *pmegrid,
 }
 
 
-static void
-wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+static void wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 {
     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix, iy, iz;
 
@@ -1349,8 +1277,7 @@ wrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 }
 
 
-static void
-unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
+static void unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
 {
     int     nx, ny, nz, pnx, pny, pnz, ny_x, overlap, ix;
 
@@ -1418,58 +1345,13 @@ unwrap_periodic_pmegrid(gmx_pme_t pme, real *pmegrid)
     }
 }
 
-static void clear_grid(int nx, int ny, int nz, real *grid,
-                       ivec fs, int *flag,
-                       int fx, int fy, int fz,
-                       int order)
-{
-    int nc, ncz;
-    int fsx, fsy, fsz, gx, gy, gz, g0x, g0y, x, y, z;
-    int flind;
-
-    nc  = 2 + (order - 2)/FLBS;
-    ncz = 2 + (order - 2)/FLBSZ;
-
-    for (fsx = fx; fsx < fx+nc; fsx++)
-    {
-        for (fsy = fy; fsy < fy+nc; fsy++)
-        {
-            for (fsz = fz; fsz < fz+ncz; fsz++)
-            {
-                flind = (fsx*fs[YY] + fsy)*fs[ZZ] + fsz;
-                if (flag[flind] == 0)
-                {
-                    gx  = fsx*FLBS;
-                    gy  = fsy*FLBS;
-                    gz  = fsz*FLBSZ;
-                    g0x = (gx*ny + gy)*nz + gz;
-                    for (x = 0; x < FLBS; x++)
-                    {
-                        g0y = g0x;
-                        for (y = 0; y < FLBS; y++)
-                        {
-                            for (z = 0; z < FLBSZ; z++)
-                            {
-                                grid[g0y+z] = 0;
-                            }
-                            g0y += nz;
-                        }
-                        g0x += ny*nz;
-                    }
-
-                    flag[flind] = 1;
-                }
-            }
-        }
-    }
-}
 
 /* This has to be a macro to enable full compiler optimization with xlC (and probably others too) */
 #define DO_BSPLINE(order)                            \
     for (ithx = 0; (ithx < order); ithx++)                    \
     {                                                    \
         index_x = (i0+ithx)*pny*pnz;                     \
-        valx    = qn*thx[ithx];                          \
+        valx    = coefficient*thx[ithx];                          \
                                                      \
         for (ithy = 0; (ithy < order); ithy++)                \
         {                                                \
@@ -1485,23 +1367,30 @@ static void clear_grid(int nx, int ny, int nz, real *grid,
     }
 
 
-static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
-                                     pme_atomcomm_t *atc, splinedata_t *spline,
-                                     pme_spline_work_t *work)
+static void spread_coefficients_bsplines_thread(pmegrid_t                    *pmegrid,
+                                                pme_atomcomm_t               *atc,
+                                                splinedata_t                 *spline,
+                                                pme_spline_work_t gmx_unused *work)
 {
 
-    /* spread charges from home atoms to local grid */
+    /* spread coefficients from home atoms to local grid */
     real          *grid;
     pme_overlap_t *ol;
     int            b, i, nn, n, ithx, ithy, ithz, i0, j0, k0;
     int       *    idxptr;
     int            order, norder, index_x, index_xy, index_xyz;
-    real           valx, valxy, qn;
+    real           valx, valxy, coefficient;
     real          *thx, *thy, *thz;
     int            localsize, bndsize;
     int            pnx, pny, pnz, ndatatot;
     int            offx, offy, offz;
 
+#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
+    real           thz_buffer[GMX_SIMD4_WIDTH*3], *thz_aligned;
+
+    thz_aligned = gmx_simd4_align_r(thz_buffer);
+#endif
+
     pnx = pmegrid->s[XX];
     pny = pmegrid->s[YY];
     pnz = pmegrid->s[ZZ];
@@ -1521,10 +1410,10 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
 
     for (nn = 0; nn < spline->n; nn++)
     {
-        n  = spline->ind[nn];
-        qn = atc->q[n];
+        n           = spline->ind[nn];
+        coefficient = atc->coefficient[n];
 
-        if (qn != 0)
+        if (coefficient != 0)
         {
             idxptr = atc->idx[n];
             norder = nn*order;
@@ -1540,23 +1429,23 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
             switch (order)
             {
                 case 4:
-#ifdef PME_SSE
-#ifdef PME_SSE_UNALIGNED
-#define PME_SPREAD_SSE_ORDER4
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifdef PME_SIMD4_UNALIGNED
+#define PME_SPREAD_SIMD4_ORDER4
 #else
-#define PME_SPREAD_SSE_ALIGNED
+#define PME_SPREAD_SIMD4_ALIGNED
 #define PME_ORDER 4
 #endif
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_BSPLINE(4);
 #endif
                     break;
                 case 5:
-#ifdef PME_SSE
-#define PME_SPREAD_SSE_ALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#define PME_SPREAD_SIMD4_ALIGNED
 #define PME_ORDER 5
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_BSPLINE(5);
 #endif
@@ -1569,11 +1458,11 @@ static void spread_q_bsplines_thread(pmegrid_t *pmegrid,
     }
 }
 
-static void set_grid_alignment(int *pmegrid_nz, int pme_order)
+static void set_grid_alignment(int gmx_unused *pmegrid_nz, int gmx_unused pme_order)
 {
-#ifdef PME_SSE
+#ifdef PME_SIMD4_SPREAD_GATHER
     if (pme_order == 5
-#ifndef PME_SSE_UNALIGNED
+#ifndef PME_SIMD4_UNALIGNED
         || pme_order == 4
 #endif
         )
@@ -1584,10 +1473,10 @@ static void set_grid_alignment(int *pmegrid_nz, int pme_order)
 #endif
 }
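The body of set_grid_alignment falls outside this hunk; what matters for the callers is that pmegrid_nz is rounded up to a SIMD4-friendly size. A generic sketch of that rounding idiom (an assumption about the elided body, not the source; the function name is hypothetical):

    /* Round n up to the next multiple of w; for w a power of two,
     * ((n + w - 1) & ~(w - 1)) computes the same result. */
    static int round_up(int n, int w)
    {
        return ((n + w - 1)/w)*w;
    }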
 
-static void set_gridsize_alignment(int *gridsize, int pme_order)
+static void set_gridsize_alignment(int gmx_unused *gridsize, int gmx_unused pme_order)
 {
-#ifdef PME_SSE
-#ifndef PME_SSE_UNALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifndef PME_SIMD4_UNALIGNED
     if (pme_order == 4)
     {
         /* Add extra elements to ensured aligned operations do not go
@@ -1638,7 +1527,7 @@ static void pmegrid_init(pmegrid_t *grid,
     {
         gridsize = grid->s[XX]*grid->s[YY]*grid->s[ZZ];
         set_gridsize_alignment(&gridsize, pme_order);
-        snew_aligned(grid->grid, gridsize, 16);
+        snew_aligned(grid->grid, gridsize, SIMD4_ALIGNMENT);
     }
     else
     {
@@ -1708,6 +1597,7 @@ static void make_subgrid_division(const ivec n, int ovl, int nthread,
 static void pmegrids_init(pmegrids_t *grids,
                           int nx, int ny, int nz, int nz_base,
                           int pme_order,
+                          gmx_bool bUseThreads,
                           int nthread,
                           int overlap_x,
                           int overlap_y)
@@ -1730,7 +1620,7 @@ static void pmegrids_init(pmegrids_t *grids,
 
     make_subgrid_division(n_base, pme_order-1, grids->nthread, grids->nc);
 
-    if (grids->nthread > 1)
+    if (bUseThreads)
     {
         ivec nst;
         int gridsize;
@@ -1756,7 +1646,7 @@ static void pmegrids_init(pmegrids_t *grids,
         set_gridsize_alignment(&gridsize, pme_order);
         snew_aligned(grids->grid_all,
                      grids->nthread*gridsize+(grids->nthread+1)*GMX_CACHE_SEP,
-                     16);
+                     SIMD4_ALIGNMENT);
 
         for (x = 0; x < grids->nc[XX]; x++)
         {
@@ -1780,6 +1670,10 @@ static void pmegrids_init(pmegrids_t *grids,
             }
         }
     }
+    else
+    {
+        grids->grid_th = NULL;
+    }
 
     snew(grids->g2t, DIM);
     tfac = 1;
@@ -1851,6 +1745,8 @@ static void pmegrids_destroy(pmegrids_t *grids)
 
 static void realloc_work(pme_work_t *work, int nkx)
 {
+    int simd_width;
+
     if (nkx > work->nalloc)
     {
         work->nalloc = nkx;
@@ -1858,15 +1754,23 @@ static void realloc_work(pme_work_t *work, int nkx)
         srenew(work->mhy, work->nalloc);
         srenew(work->mhz, work->nalloc);
         srenew(work->m2, work->nalloc);
-        /* Allocate an aligned pointer for SSE operations, including 3 extra
-         * elements at the end since SSE operates on 4 elements at a time.
+        /* Allocate an aligned pointer for SIMD operations, including extra
+         * elements at the end for padding.
          */
+#ifdef PME_SIMD_SOLVE
+        simd_width = GMX_SIMD_REAL_WIDTH;
+#else
+        /* We can use any alignment, apart from 0, so we use 4 */
+        simd_width = 4;
+#endif
         sfree_aligned(work->denom);
         sfree_aligned(work->tmp1);
+        sfree_aligned(work->tmp2);
         sfree_aligned(work->eterm);
-        snew_aligned(work->denom, work->nalloc+3, 16);
-        snew_aligned(work->tmp1, work->nalloc+3, 16);
-        snew_aligned(work->eterm, work->nalloc+3, 16);
+        snew_aligned(work->denom, work->nalloc+simd_width, simd_width*sizeof(real));
+        snew_aligned(work->tmp1,  work->nalloc+simd_width, simd_width*sizeof(real));
+        snew_aligned(work->tmp2,  work->nalloc+simd_width, simd_width*sizeof(real));
+        snew_aligned(work->eterm, work->nalloc+simd_width, simd_width*sizeof(real));
         srenew(work->m2inv, work->nalloc);
     }
 }
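The simd_width extra elements plus the simd_width*sizeof(real) alignment let the solve loops below run whole vectors to just past nkx without remainder handling. A minimal sketch of the resulting loop shape, under the same assumptions (the function name is hypothetical, not code from this change):

    /* Illustration: invert a padded, aligned array in whole vectors.
     * The final iteration may touch up to GMX_SIMD_REAL_WIDTH-1 elements
     * past end, which the padding above makes safe. */
    static void invert_padded(real *denom, int end)
    {
        gmx_simd_real_t d;
        int             kx;

        for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
        {
            d = gmx_simd_load_r(denom + kx);
            gmx_simd_store_r(denom + kx, gmx_simd_inv_r(d));
        }
    }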
@@ -1880,37 +1784,40 @@ static void free_work(pme_work_t *work)
     sfree(work->m2);
     sfree_aligned(work->denom);
     sfree_aligned(work->tmp1);
+    sfree_aligned(work->tmp2);
     sfree_aligned(work->eterm);
     sfree(work->m2inv);
 }
 
 
-#ifdef PME_SSE
-/* Calculate exponentials through SSE in float precision */
-inline static void calc_exponentials(int start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
+#if defined PME_SIMD_SOLVE
+/* Calculate exponentials through SIMD */
+gmx_inline static void calc_exponentials_q(int gmx_unused start, int end, real f, real *d_aligned, real *r_aligned, real *e_aligned)
 {
     {
-        const __m128 two = _mm_set_ps(2.0f, 2.0f, 2.0f, 2.0f);
-        __m128 f_sse;
-        __m128 lu;
-        __m128 tmp_d1, d_inv, tmp_r, tmp_e;
+        const gmx_simd_real_t two = gmx_simd_set1_r(2.0);
+        gmx_simd_real_t f_simd;
+        gmx_simd_real_t lu;
+        gmx_simd_real_t tmp_d1, d_inv, tmp_r, tmp_e;
         int kx;
-        f_sse = _mm_load1_ps(&f);
-        for (kx = 0; kx < end; kx += 4)
+        f_simd = gmx_simd_set1_r(f);
+        /* We only need to calculate from start. But since start is 0 or 1
+         * and we want to use aligned loads/stores, we always start from 0.
+         */
+        for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
         {
-            tmp_d1   = _mm_load_ps(d_aligned+kx);
-            lu       = _mm_rcp_ps(tmp_d1);
-            d_inv    = _mm_mul_ps(lu, _mm_sub_ps(two, _mm_mul_ps(lu, tmp_d1)));
-            tmp_r    = _mm_load_ps(r_aligned+kx);
-            tmp_r    = gmx_mm_exp_ps(tmp_r);
-            tmp_e    = _mm_mul_ps(f_sse, d_inv);
-            tmp_e    = _mm_mul_ps(tmp_e, tmp_r);
-            _mm_store_ps(e_aligned+kx, tmp_e);
+            tmp_d1   = gmx_simd_load_r(d_aligned+kx);
+            d_inv    = gmx_simd_inv_r(tmp_d1);
+            tmp_r    = gmx_simd_load_r(r_aligned+kx);
+            tmp_r    = gmx_simd_exp_r(tmp_r);
+            tmp_e    = gmx_simd_mul_r(f_simd, d_inv);
+            tmp_e    = gmx_simd_mul_r(tmp_e, tmp_r);
+            gmx_simd_store_r(e_aligned+kx, tmp_e);
         }
     }
 }
 #else
-inline static void calc_exponentials(int start, int end, real f, real *d, real *r, real *e)
+gmx_inline static void calc_exponentials_q(int start, int end, real f, real *d, real *r, real *e)
 {
     int kx;
     for (kx = start; kx < end; kx++)
@@ -1928,6 +1835,51 @@ inline static void calc_exponentials(int start, int end, real f, real *d, real *
 }
 #endif
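The scalar fallback's loop body lies outside the hunk; for reading the SIMD path above, the per-element operation both variants perform is e = f*exp(r)/d. A minimal scalar sketch (not the elided source; the _ref suffix is hypothetical):

    /* Reference form of what both calc_exponentials_q paths compute */
    static void calc_exponentials_q_ref(int start, int end, real f,
                                        real *d, real *r, real *e)
    {
        int kx;

        for (kx = start; kx < end; kx++)
        {
            e[kx] = f*exp(r[kx])/d[kx];
        }
    }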
 
+#if defined PME_SIMD_SOLVE
+/* Calculate exponentials through SIMD */
+gmx_inline static void calc_exponentials_lj(int gmx_unused start, int end, real *r_aligned, real *factor_aligned, real *d_aligned)
+{
+    gmx_simd_real_t tmp_r, tmp_d, tmp_fac, d_inv, tmp_mk;
+    const gmx_simd_real_t sqr_PI = gmx_simd_sqrt_r(gmx_simd_set1_r(M_PI));
+    int kx;
+    for (kx = 0; kx < end; kx += GMX_SIMD_REAL_WIDTH)
+    {
+        /* We only need to calculate from start. But since start is 0 or 1
+         * and we want to use aligned loads/stores, we always start from 0.
+         */
+        tmp_d = gmx_simd_load_r(d_aligned+kx);
+        d_inv = gmx_simd_inv_r(tmp_d);
+        gmx_simd_store_r(d_aligned+kx, d_inv);
+        tmp_r = gmx_simd_load_r(r_aligned+kx);
+        tmp_r = gmx_simd_exp_r(tmp_r);
+        gmx_simd_store_r(r_aligned+kx, tmp_r);
+        tmp_mk  = gmx_simd_load_r(factor_aligned+kx);
+        tmp_fac = gmx_simd_mul_r(sqr_PI, gmx_simd_mul_r(tmp_mk, gmx_simd_erfc_r(tmp_mk)));
+        gmx_simd_store_r(factor_aligned+kx, tmp_fac);
+    }
+}
+#else
+gmx_inline static void calc_exponentials_lj(int start, int end, real *r, real *tmp2, real *d)
+{
+    int kx;
+    real mk;
+    for (kx = start; kx < end; kx++)
+    {
+        d[kx] = 1.0/d[kx];
+    }
+
+    for (kx = start; kx < end; kx++)
+    {
+        r[kx] = exp(r[kx]);
+    }
+
+    for (kx = start; kx < end; kx++)
+    {
+        mk       = tmp2[kx];
+        tmp2[kx] = sqrt(M_PI)*mk*gmx_erfc(mk);
+    }
+}
+#endif
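Writing x = sqrt(factor)*|m| = pi*|m|/ewaldcoeff, the three arrays prepared here (exp(-x^2), sqrt(pi)*x*erfc(x), and the inverted denominator) assemble into the dispersion influence function applied in solve_pme_lj_yzx below; a worked form consistent with that loop:

    \mathrm{eterm}(m) = -\frac{(1-2x^{2})\,e^{-x^{2}} + 2\sqrt{\pi}\,x^{3}\,\mathrm{erfc}(x)}
                              {b_x(k_x)\,b_y(k_y)\,b_z(k_z)},
    \qquad x = \frac{\pi\,\lvert m\rvert}{\beta}

with beta the Ewald coefficient and b the B-spline moduli (denom before inversion, up to the prefactor 3V/(pi^{3/2} beta^3) that by carries).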
 
 static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
                          real ewaldcoeff, real vol,
@@ -1961,7 +1913,7 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
     nz = pme->nkz;
 
     /* Dimensions should be identical for A/B grid, so we just use A here */
-    gmx_parallel_3dfft_complex_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_QA],
                                       complex_order,
                                       local_ndata,
                                       local_offset,
@@ -2025,6 +1977,7 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
         p0 = grid + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
 
         /* We should skip the k-space point (0,0,0) */
+        /* Note that since here x is the minor index, local_offset[XX]=0 */
         if (local_offset[XX] > 0 || ky > 0 || kz > 0)
         {
             kxstart = local_offset[XX];
@@ -2082,7 +2035,7 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
                 m2inv[kx] = 1.0/m2[kx];
             }
 
-            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+            calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
 
             for (kx = kxstart; kx < kxend; kx++, p0++)
             {
@@ -2144,7 +2097,7 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
                 tmp1[kx]  = -factor*m2k;
             }
 
-            calc_exponentials(kxstart, kxend, elfac, denom, tmp1, eterm);
+            calc_exponentials_q(kxstart, kxend, elfac, denom, tmp1, eterm);
 
             for (kx = kxstart; kx < kxend; kx++, p0++)
             {
@@ -2165,127 +2118,464 @@ static int solve_pme_yzx(gmx_pme_t pme, t_complex *grid,
          * experiencing problems on semiisotropic membranes.
          * IS THAT COMMENT STILL VALID??? (DvdS, 2001/02/07).
          */
-        work->vir[XX][XX] = 0.25*virxx;
-        work->vir[YY][YY] = 0.25*viryy;
-        work->vir[ZZ][ZZ] = 0.25*virzz;
-        work->vir[XX][YY] = work->vir[YY][XX] = 0.25*virxy;
-        work->vir[XX][ZZ] = work->vir[ZZ][XX] = 0.25*virxz;
-        work->vir[YY][ZZ] = work->vir[ZZ][YY] = 0.25*viryz;
+        work->vir_q[XX][XX] = 0.25*virxx;
+        work->vir_q[YY][YY] = 0.25*viryy;
+        work->vir_q[ZZ][ZZ] = 0.25*virzz;
+        work->vir_q[XX][YY] = work->vir_q[YY][XX] = 0.25*virxy;
+        work->vir_q[XX][ZZ] = work->vir_q[ZZ][XX] = 0.25*virxz;
+        work->vir_q[YY][ZZ] = work->vir_q[ZZ][YY] = 0.25*viryz;
 
         /* This energy should be corrected for a charged system */
-        work->energy = 0.5*energy;
+        work->energy_q = 0.5*energy;
     }
 
     /* Return the loop count */
     return local_ndata[YY]*local_ndata[XX];
 }
 
-static void get_pme_ener_vir(const gmx_pme_t pme, int nthread,
-                             real *mesh_energy, matrix vir)
+static int solve_pme_lj_yzx(gmx_pme_t pme, t_complex **grid, gmx_bool bLB,
+                            real ewaldcoeff, real vol,
+                            gmx_bool bEnerVir, int nthread, int thread)
 {
-    /* This function sums output over threads
-     * and should therefore only be called after thread synchronization.
-     */
-    int thread;
-
-    *mesh_energy = pme->work[0].energy;
-    copy_mat(pme->work[0].vir, vir);
-
-    for (thread = 1; thread < nthread; thread++)
-    {
-        *mesh_energy += pme->work[thread].energy;
-        m_add(vir, pme->work[thread].vir, vir);
-    }
-}
-
-#define DO_FSPLINE(order)                      \
-    for (ithx = 0; (ithx < order); ithx++)              \
-    {                                              \
-        index_x = (i0+ithx)*pny*pnz;               \
-        tx      = thx[ithx];                       \
-        dx      = dthx[ithx];                      \
-                                               \
-        for (ithy = 0; (ithy < order); ithy++)          \
-        {                                          \
-            index_xy = index_x+(j0+ithy)*pnz;      \
-            ty       = thy[ithy];                  \
-            dy       = dthy[ithy];                 \
-            fxy1     = fz1 = 0;                    \
-                                               \
-            for (ithz = 0; (ithz < order); ithz++)      \
-            {                                      \
-                gval  = grid[index_xy+(k0+ithz)];  \
-                fxy1 += thz[ithz]*gval;            \
-                fz1  += dthz[ithz]*gval;           \
-            }                                      \
-            fx += dx*ty*fxy1;                      \
-            fy += tx*dy*fxy1;                      \
-            fz += tx*ty*fz1;                       \
-        }                                          \
-    }
-
-
-static void gather_f_bsplines(gmx_pme_t pme, real *grid,
-                              gmx_bool bClearF, pme_atomcomm_t *atc,
-                              splinedata_t *spline,
-                              real scale)
-{
-    /* sum forces for local particles */
-    int     nn, n, ithx, ithy, ithz, i0, j0, k0;
-    int     index_x, index_xy;
-    int     nx, ny, nz, pnx, pny, pnz;
-    int *   idxptr;
-    real    tx, ty, dx, dy, qn;
-    real    fx, fy, fz, gval;
-    real    fxy1, fz1;
-    real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
-    int     norder;
+    /* do recip sum over local cells in grid */
+    /* y major, z middle, x minor or continuous */
+    int     ig, gcount;
+    int     kx, ky, kz, maxkx, maxky, maxkz;
+    int     nx, ny, nz, iy, iyz0, iyz1, iyz, iz, kxstart, kxend;
+    real    mx, my, mz;
+    real    factor = M_PI*M_PI/(ewaldcoeff*ewaldcoeff);
+    real    ets2, ets2vf;
+    real    eterm, vterm, d1, d2, energy = 0;
+    real    by, bz;
+    real    virxx = 0, virxy = 0, virxz = 0, viryy = 0, viryz = 0, virzz = 0;
     real    rxx, ryx, ryy, rzx, rzy, rzz;
-    int     order;
+    real    *mhx, *mhy, *mhz, *m2, *denom, *tmp1, *tmp2;
+    real    mhxk, mhyk, mhzk, m2k;
+    real    mk;
+    pme_work_t *work;
+    real    corner_fac;
+    ivec    complex_order;
+    ivec    local_ndata, local_offset, local_size;
+    nx = pme->nkx;
+    ny = pme->nky;
+    nz = pme->nkz;
 
-    pme_spline_work_t *work;
+    /* Dimensions should be identical for A/B grid, so we just use A here */
+    gmx_parallel_3dfft_complex_limits(pme->pfft_setup[PME_GRID_C6A],
+                                      complex_order,
+                                      local_ndata,
+                                      local_offset,
+                                      local_size);
+    rxx = pme->recipbox[XX][XX];
+    ryx = pme->recipbox[YY][XX];
+    ryy = pme->recipbox[YY][YY];
+    rzx = pme->recipbox[ZZ][XX];
+    rzy = pme->recipbox[ZZ][YY];
+    rzz = pme->recipbox[ZZ][ZZ];
 
-    work = pme->spline_work;
+    maxkx = (nx+1)/2;
+    maxky = (ny+1)/2;
+    maxkz = nz/2+1;
 
-    order = pme->pme_order;
-    thx   = spline->theta[XX];
-    thy   = spline->theta[YY];
-    thz   = spline->theta[ZZ];
-    dthx  = spline->dtheta[XX];
-    dthy  = spline->dtheta[YY];
-    dthz  = spline->dtheta[ZZ];
-    nx    = pme->nkx;
-    ny    = pme->nky;
-    nz    = pme->nkz;
-    pnx   = pme->pmegrid_nx;
-    pny   = pme->pmegrid_ny;
-    pnz   = pme->pmegrid_nz;
+    work  = &pme->work[thread];
+    mhx   = work->mhx;
+    mhy   = work->mhy;
+    mhz   = work->mhz;
+    m2    = work->m2;
+    denom = work->denom;
+    tmp1  = work->tmp1;
+    tmp2  = work->tmp2;
 
-    rxx   = pme->recipbox[XX][XX];
-    ryx   = pme->recipbox[YY][XX];
-    ryy   = pme->recipbox[YY][YY];
-    rzx   = pme->recipbox[ZZ][XX];
-    rzy   = pme->recipbox[ZZ][YY];
-    rzz   = pme->recipbox[ZZ][ZZ];
+    iyz0 = local_ndata[YY]*local_ndata[ZZ]* thread   /nthread;
+    iyz1 = local_ndata[YY]*local_ndata[ZZ]*(thread+1)/nthread;
 
-    for (nn = 0; nn < spline->n; nn++)
+    for (iyz = iyz0; iyz < iyz1; iyz++)
     {
-        n  = spline->ind[nn];
-        qn = scale*atc->q[n];
+        iy = iyz/local_ndata[ZZ];
+        iz = iyz - iy*local_ndata[ZZ];
 
-        if (bClearF)
+        ky = iy + local_offset[YY];
+
+        if (ky < maxky)
         {
-            atc->f[n][XX] = 0;
-            atc->f[n][YY] = 0;
-            atc->f[n][ZZ] = 0;
+            my = ky;
         }
-        if (qn != 0)
+        else
         {
-            fx     = 0;
-            fy     = 0;
-            fz     = 0;
-            idxptr = atc->idx[n];
-            norder = nn*order;
+            my = (ky - ny);
+        }
+
+        by = 3.0*vol*pme->bsp_mod[YY][ky]
+            / (M_PI*sqrt(M_PI)*ewaldcoeff*ewaldcoeff*ewaldcoeff);
+
+        kz = iz + local_offset[ZZ];
+
+        mz = kz;
+
+        bz = pme->bsp_mod[ZZ][kz];
+
+        /* 0.5 correction for corner points */
+        corner_fac = 1;
+        if (kz == 0 || kz == (nz+1)/2)
+        {
+            corner_fac = 0.5;
+        }
+
+        kxstart = local_offset[XX];
+        kxend   = local_offset[XX] + local_ndata[XX];
+        if (bEnerVir)
+        {
+            /* More expensive inner loop, especially because of the
+             * storage of the mh elements in arrays.  Because x is the
+             * minor grid index, all mh elements depend on kx for
+             * triclinic unit cells.
+             */
+
+            /* Two explicit loops to avoid a conditional inside the loop */
+            for (kx = kxstart; kx < maxkx; kx++)
+            {
+                mx = kx;
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                mhx[kx]   = mhxk;
+                mhy[kx]   = mhyk;
+                mhz[kx]   = mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+                tmp2[kx]  = sqrt(factor*m2k);
+            }
+
+            for (kx = maxkx; kx < kxend; kx++)
+            {
+                mx = (kx - nx);
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                mhx[kx]   = mhxk;
+                mhy[kx]   = mhyk;
+                mhz[kx]   = mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+                tmp2[kx]  = sqrt(factor*m2k);
+            }
+
+            calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
+
+            for (kx = kxstart; kx < kxend; kx++)
+            {
+                m2k   = factor*m2[kx];
+                eterm = -((1.0 - 2.0*m2k)*tmp1[kx]
+                          + 2.0*m2k*tmp2[kx]);
+                vterm    = 3.0*(-tmp1[kx] + tmp2[kx]);
+                tmp1[kx] = eterm*denom[kx];
+                tmp2[kx] = vterm*denom[kx];
+            }
+
+            if (!bLB)
+            {
+                t_complex *p0;
+                real       struct2;
+
+                p0 = grid[0] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                for (kx = kxstart; kx < kxend; kx++, p0++)
+                {
+                    d1      = p0->re;
+                    d2      = p0->im;
+
+                    eterm   = tmp1[kx];
+                    vterm   = tmp2[kx];
+                    p0->re  = d1*eterm;
+                    p0->im  = d2*eterm;
+
+                    struct2 = 2.0*(d1*d1+d2*d2);
+
+                    tmp1[kx] = eterm*struct2;
+                    tmp2[kx] = vterm*struct2;
+                }
+            }
+            else
+            {
+                real *struct2 = denom;
+                real  str2;
+
+                for (kx = kxstart; kx < kxend; kx++)
+                {
+                    struct2[kx] = 0.0;
+                }
+                /* Due to symmetry we only need to calculate 4 of the 7 terms */
+                for (ig = 0; ig <= 3; ++ig)
+                {
+                    t_complex *p0, *p1;
+                    real       scale;
+
+                    p0    = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                    p1    = grid[6-ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                    scale = 2.0*lb_scale_factor_symm[ig];
+                    for (kx = kxstart; kx < kxend; ++kx, ++p0, ++p1)
+                    {
+                        struct2[kx] += scale*(p0->re*p1->re + p0->im*p1->im);
+                    }
+
+                }
+                for (ig = 0; ig <= 6; ++ig)
+                {
+                    t_complex *p0;
+
+                    p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                    for (kx = kxstart; kx < kxend; kx++, p0++)
+                    {
+                        d1     = p0->re;
+                        d2     = p0->im;
+
+                        eterm  = tmp1[kx];
+                        p0->re = d1*eterm;
+                        p0->im = d2*eterm;
+                    }
+                }
+                for (kx = kxstart; kx < kxend; kx++)
+                {
+                    eterm    = tmp1[kx];
+                    vterm    = tmp2[kx];
+                    str2     = struct2[kx];
+                    tmp1[kx] = eterm*str2;
+                    tmp2[kx] = vterm*str2;
+                }
+            }
+
+            for (kx = kxstart; kx < kxend; kx++)
+            {
+                ets2     = corner_fac*tmp1[kx];
+                vterm    = 2.0*factor*tmp2[kx];
+                energy  += ets2;
+                ets2vf   = corner_fac*vterm;
+                virxx   += ets2vf*mhx[kx]*mhx[kx] - ets2;
+                virxy   += ets2vf*mhx[kx]*mhy[kx];
+                virxz   += ets2vf*mhx[kx]*mhz[kx];
+                viryy   += ets2vf*mhy[kx]*mhy[kx] - ets2;
+                viryz   += ets2vf*mhy[kx]*mhz[kx];
+                virzz   += ets2vf*mhz[kx]*mhz[kx] - ets2;
+            }
+        }
+        else
+        {
+            /* We don't need to calculate the energy and the virial.
+             * In this case the triclinic overhead is small.
+             */
+
+            /* Two explicit loops to avoid a conditional inside the loop */
+
+            for (kx = kxstart; kx < maxkx; kx++)
+            {
+                mx = kx;
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+                tmp2[kx]  = sqrt(factor*m2k);
+            }
+
+            for (kx = maxkx; kx < kxend; kx++)
+            {
+                mx = (kx - nx);
+
+                mhxk      = mx * rxx;
+                mhyk      = mx * ryx + my * ryy;
+                mhzk      = mx * rzx + my * rzy + mz * rzz;
+                m2k       = mhxk*mhxk + mhyk*mhyk + mhzk*mhzk;
+                m2[kx]    = m2k;
+                denom[kx] = bz*by*pme->bsp_mod[XX][kx];
+                tmp1[kx]  = -factor*m2k;
+                tmp2[kx]  = sqrt(factor*m2k);
+            }
+
+            calc_exponentials_lj(kxstart, kxend, tmp1, tmp2, denom);
+
+            for (kx = kxstart; kx < kxend; kx++)
+            {
+                m2k    = factor*m2[kx];
+                eterm  = -((1.0 - 2.0*m2k)*tmp1[kx]
+                           + 2.0*m2k*tmp2[kx]);
+                tmp1[kx] = eterm*denom[kx];
+            }
+            gcount = (bLB ? 7 : 1);
+            for (ig = 0; ig < gcount; ++ig)
+            {
+                t_complex *p0;
+
+                p0 = grid[ig] + iy*local_size[ZZ]*local_size[XX] + iz*local_size[XX];
+                for (kx = kxstart; kx < kxend; kx++, p0++)
+                {
+                    d1      = p0->re;
+                    d2      = p0->im;
+
+                    eterm   = tmp1[kx];
+
+                    p0->re  = d1*eterm;
+                    p0->im  = d2*eterm;
+                }
+            }
+        }
+    }
+    if (bEnerVir)
+    {
+        work->vir_lj[XX][XX] = 0.25*virxx;
+        work->vir_lj[YY][YY] = 0.25*viryy;
+        work->vir_lj[ZZ][ZZ] = 0.25*virzz;
+        work->vir_lj[XX][YY] = work->vir_lj[YY][XX] = 0.25*virxy;
+        work->vir_lj[XX][ZZ] = work->vir_lj[ZZ][XX] = 0.25*virxz;
+        work->vir_lj[YY][ZZ] = work->vir_lj[ZZ][YY] = 0.25*viryz;
+
+        /* This energy should be corrected for a charged system */
+        work->energy_lj = 0.5*energy;
+    }
+    /* Return the loop count */
+    return local_ndata[YY]*local_ndata[XX];
+}
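
The bLB branch above exploits the symmetry between grid[ig] and grid[6-ig]
among the seven Lorentz-Berthelot component grids, so only four of the seven
cross terms are computed explicitly. A minimal standalone sketch of that
reduction, with t_complex replaced by a local struct and scale_symm[] assumed
to absorb the double counting of the ig == 3 self pairing (as
lb_scale_factor_symm presumably does):

    #include <stddef.h>

    typedef struct { double re, im; } cplx_sketch;

    /* struct2[k] accumulates 2*scale_symm[ig]*Re(g_ig[k]*conj(g_(6-ig)[k]))
     * over the four unique pairs (0,6), (1,5), (2,4) and (3,3). */
    static void reduce_lb_struct2_sketch(size_t nk, const cplx_sketch *grid[7],
                                         const double scale_symm[4],
                                         double *struct2)
    {
        size_t k;
        int    ig;

        for (k = 0; k < nk; k++)
        {
            struct2[k] = 0.0;
        }
        for (ig = 0; ig <= 3; ig++)
        {
            const cplx_sketch *p0    = grid[ig];
            const cplx_sketch *p1    = grid[6 - ig];
            double             scale = 2.0*scale_symm[ig];

            for (k = 0; k < nk; k++)
            {
                struct2[k] += scale*(p0[k].re*p1[k].re + p0[k].im*p1[k].im);
            }
        }
    }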
+
+static void get_pme_ener_vir_q(const gmx_pme_t pme, int nthread,
+                               real *mesh_energy, matrix vir)
+{
+    /* This function sums output over threads and should therefore
+     * only be called after thread synchronization.
+     */
+    int thread;
+
+    *mesh_energy = pme->work[0].energy_q;
+    copy_mat(pme->work[0].vir_q, vir);
+
+    for (thread = 1; thread < nthread; thread++)
+    {
+        *mesh_energy += pme->work[thread].energy_q;
+        m_add(vir, pme->work[thread].vir_q, vir);
+    }
+}
+
+static void get_pme_ener_vir_lj(const gmx_pme_t pme, int nthread,
+                                real *mesh_energy, matrix vir)
+{
+    /* This function sums output over threads and should therefore
+     * only be called after thread synchronization.
+     */
+    int thread;
+
+    *mesh_energy = pme->work[0].energy_lj;
+    copy_mat(pme->work[0].vir_lj, vir);
+
+    for (thread = 1; thread < nthread; thread++)
+    {
+        *mesh_energy += pme->work[thread].energy_lj;
+        m_add(vir, pme->work[thread].vir_lj, vir);
+    }
+}
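
Both reduction helpers assume every thread has finished writing its
pme->work[thread] entry, hence the "after thread synchronization" caveat in
their comments. A self-contained sketch of the intended pattern, with a
trivial stand-in for the per-thread solve:

    #include <omp.h>

    typedef struct { double energy; } work_sketch_t;

    /* Each thread fills its own slot; the implicit barrier at the end of
     * the parallel region provides the synchronization, then one thread
     * sums serially, mirroring get_pme_ener_vir_q/_lj. */
    static double sum_after_barrier(work_sketch_t *work, int nthread)
    {
        double energy;
        int    t;

    #pragma omp parallel num_threads(nthread)
        {
            work[omp_get_thread_num()].energy = 1.0; /* stand-in solve */
        } /* implicit barrier: all slots are complete here */

        energy = work[0].energy;
        for (t = 1; t < nthread; t++)
        {
            energy += work[t].energy;
        }
        return energy;
    }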
+
+
+#define DO_FSPLINE(order)                      \
+    for (ithx = 0; (ithx < order); ithx++)              \
+    {                                              \
+        index_x = (i0+ithx)*pny*pnz;               \
+        tx      = thx[ithx];                       \
+        dx      = dthx[ithx];                      \
+                                               \
+        for (ithy = 0; (ithy < order); ithy++)          \
+        {                                          \
+            index_xy = index_x+(j0+ithy)*pnz;      \
+            ty       = thy[ithy];                  \
+            dy       = dthy[ithy];                 \
+            fxy1     = fz1 = 0;                    \
+                                               \
+            for (ithz = 0; (ithz < order); ithz++)      \
+            {                                      \
+                gval  = grid[index_xy+(k0+ithz)];  \
+                fxy1 += thz[ithz]*gval;            \
+                fz1  += dthz[ithz]*gval;           \
+            }                                      \
+            fx += dx*ty*fxy1;                      \
+            fy += tx*dy*fxy1;                      \
+            fz += tx*ty*fz1;                       \
+        }                                          \
+    }
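
For readability, here is the same tensor-product gather written as a plain
function instead of a macro; names and loop structure mirror DO_FSPLINE, with
real narrowed to double and the grid passed explicitly (a sketch, not a
drop-in replacement):

    static void do_fspline_sketch(int order, const double *grid,
                                  int pny, int pnz, int i0, int j0, int k0,
                                  const double *thx, const double *thy,
                                  const double *thz, const double *dthx,
                                  const double *dthy, const double *dthz,
                                  double *fx, double *fy, double *fz)
    {
        int ithx, ithy, ithz;

        for (ithx = 0; ithx < order; ithx++)
        {
            int    index_x = (i0 + ithx)*pny*pnz;
            double tx      = thx[ithx];
            double dx      = dthx[ithx];

            for (ithy = 0; ithy < order; ithy++)
            {
                int    index_xy = index_x + (j0 + ithy)*pnz;
                double ty       = thy[ithy];
                double dy       = dthy[ithy];
                double fxy1     = 0, fz1 = 0;

                for (ithz = 0; ithz < order; ithz++)
                {
                    double gval = grid[index_xy + (k0 + ithz)];

                    fxy1 += thz[ithz]*gval;  /* spline value along z */
                    fz1  += dthz[ithz]*gval; /* spline derivative along z */
                }
                *fx += dx*ty*fxy1; /* derivative weight in x */
                *fy += tx*dy*fxy1; /* derivative weight in y */
                *fz += tx*ty*fz1;  /* derivative weight in z */
            }
        }
    }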
+
+
+static void gather_f_bsplines(gmx_pme_t pme, real *grid,
+                              gmx_bool bClearF, pme_atomcomm_t *atc,
+                              splinedata_t *spline,
+                              real scale)
+{
+    /* sum forces for local particles */
+    int     nn, n, ithx, ithy, ithz, i0, j0, k0;
+    int     index_x, index_xy;
+    int     nx, ny, nz, pnx, pny, pnz;
+    int *   idxptr;
+    real    tx, ty, dx, dy, coefficient;
+    real    fx, fy, fz, gval;
+    real    fxy1, fz1;
+    real    *thx, *thy, *thz, *dthx, *dthy, *dthz;
+    int     norder;
+    real    rxx, ryx, ryy, rzx, rzy, rzz;
+    int     order;
+
+    pme_spline_work_t *work;
+
+#if defined PME_SIMD4_SPREAD_GATHER && !defined PME_SIMD4_UNALIGNED
+    real           thz_buffer[GMX_SIMD4_WIDTH*3],  *thz_aligned;
+    real           dthz_buffer[GMX_SIMD4_WIDTH*3], *dthz_aligned;
+
+    thz_aligned  = gmx_simd4_align_r(thz_buffer);
+    dthz_aligned = gmx_simd4_align_r(dthz_buffer);
+#endif
+
+    work = pme->spline_work;
+
+    order = pme->pme_order;
+    thx   = spline->theta[XX];
+    thy   = spline->theta[YY];
+    thz   = spline->theta[ZZ];
+    dthx  = spline->dtheta[XX];
+    dthy  = spline->dtheta[YY];
+    dthz  = spline->dtheta[ZZ];
+    nx    = pme->nkx;
+    ny    = pme->nky;
+    nz    = pme->nkz;
+    pnx   = pme->pmegrid_nx;
+    pny   = pme->pmegrid_ny;
+    pnz   = pme->pmegrid_nz;
+
+    rxx   = pme->recipbox[XX][XX];
+    ryx   = pme->recipbox[YY][XX];
+    ryy   = pme->recipbox[YY][YY];
+    rzx   = pme->recipbox[ZZ][XX];
+    rzy   = pme->recipbox[ZZ][YY];
+    rzz   = pme->recipbox[ZZ][ZZ];
+
+    for (nn = 0; nn < spline->n; nn++)
+    {
+        n           = spline->ind[nn];
+        coefficient = scale*atc->coefficient[n];
+
+        if (bClearF)
+        {
+            atc->f[n][XX] = 0;
+            atc->f[n][YY] = 0;
+            atc->f[n][ZZ] = 0;
+        }
+        if (coefficient != 0)
+        {
+            fx     = 0;
+            fy     = 0;
+            fz     = 0;
+            idxptr = atc->idx[n];
+            norder = nn*order;
 
             i0   = idxptr[XX];
             j0   = idxptr[YY];
@@ -2302,23 +2592,23 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
             switch (order)
             {
                 case 4:
-#ifdef PME_SSE
-#ifdef PME_SSE_UNALIGNED
-#define PME_GATHER_F_SSE_ORDER4
+#ifdef PME_SIMD4_SPREAD_GATHER
+#ifdef PME_SIMD4_UNALIGNED
+#define PME_GATHER_F_SIMD4_ORDER4
 #else
-#define PME_GATHER_F_SSE_ALIGNED
+#define PME_GATHER_F_SIMD4_ALIGNED
 #define PME_ORDER 4
 #endif
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_FSPLINE(4);
 #endif
                     break;
                 case 5:
-#ifdef PME_SSE
-#define PME_GATHER_F_SSE_ALIGNED
+#ifdef PME_SIMD4_SPREAD_GATHER
+#define PME_GATHER_F_SIMD4_ALIGNED
 #define PME_ORDER 5
-#include "pme_sse_single.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_FSPLINE(5);
 #endif
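
The PME_ORDER/#include pattern above re-includes gromacs/mdlib/pme_simd4.h
with different macros defined, so a single header expands into several
order-specialized kernels. A self-contained sketch of the same preprocessor
technique, using a generator macro instead of a separate header (names
hypothetical):

    /* Each invocation emits one order-specialized function, in the spirit
     * of re-including pme_simd4.h with PME_ORDER set. */
    #define MAKE_SUM_WEIGHTS(order)                        \
        static double sum_weights_##order(const double *w) \
        {                                                  \
            double s = 0;                                  \
            int    i;                                      \
            for (i = 0; i < (order); i++)                  \
            {                                              \
                s += w[i];                                 \
            }                                              \
            return s;                                      \
        }

    MAKE_SUM_WEIGHTS(4) /* defines sum_weights_4() */
    MAKE_SUM_WEIGHTS(5) /* defines sum_weights_5() */
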
@@ -2328,9 +2618,9 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
                     break;
             }
 
-            atc->f[n][XX] += -qn*( fx*nx*rxx );
-            atc->f[n][YY] += -qn*( fx*nx*ryx + fy*ny*ryy );
-            atc->f[n][ZZ] += -qn*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
+            atc->f[n][XX] += -coefficient*( fx*nx*rxx );
+            atc->f[n][YY] += -coefficient*( fx*nx*ryx + fy*ny*ryy );
+            atc->f[n][ZZ] += -coefficient*( fx*nx*rzx + fy*ny*rzy + fz*nz*rzz );
         }
     }
     /* Since the energy and not forces are interpolated
@@ -2352,7 +2642,7 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
     int     n, ithx, ithy, ithz, i0, j0, k0;
     int     index_x, index_xy;
     int *   idxptr;
-    real    energy, pot, tx, ty, qn, gval;
+    real    energy, pot, tx, ty, coefficient, gval;
     real    *thx, *thy, *thz;
     int     norder;
     int     order;
@@ -2364,9 +2654,9 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
     energy = 0;
     for (n = 0; (n < atc->n); n++)
     {
-        qn      = atc->q[n];
+        coefficient = atc->coefficient[n];
 
-        if (qn != 0)
+        if (coefficient != 0)
         {
             idxptr = atc->idx[n];
             norder = n*order;
@@ -2400,7 +2690,7 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
                 }
             }
 
-            energy += pot*qn;
+            energy += pot*coefficient;
         }
     }
 
@@ -2462,8 +2752,8 @@ static real gather_energy_bsplines(gmx_pme_t pme, real *grid,
     }
 
 void make_bsplines(splinevec theta, splinevec dtheta, int order,
-                   rvec fractx[], int nr, int ind[], real charge[],
-                   gmx_bool bFreeEnergy)
+                   rvec fractx[], int nr, int ind[], real coefficient[],
+                   gmx_bool bDoSplines)
 {
     /* construct splines for local atoms */
     int  i, ii;
@@ -2471,12 +2761,12 @@ void make_bsplines(splinevec theta, splinevec dtheta, int order,
 
     for (i = 0; i < nr; i++)
     {
-        /* With free energy we do not use the charge check.
+        /* With free energy we do not use the coefficient check.
          * In most cases this will be more efficient than calling make_bsplines
-         * twice, since usually more than half the particles have charges.
+         * twice, since usually more than half the particles have non-zero coefficients.
          */
         ii = ind[i];
-        if (bFreeEnergy || charge[ii] != 0.0)
+        if (bDoSplines || coefficient[ii] != 0.0)
         {
             xptr = fractx[ii];
             switch (order)
@@ -2684,7 +2974,7 @@ static void setup_coordinate_communication(pme_atomcomm_t *atc)
 
 int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
 {
-    int thread;
+    int thread, i;
 
     if (NULL != log)
     {
@@ -2695,19 +2985,17 @@ int gmx_pme_destroy(FILE *log, gmx_pme_t *pmedata)
     sfree((*pmedata)->nny);
     sfree((*pmedata)->nnz);
 
-    pmegrids_destroy(&(*pmedata)->pmegridA);
-
-    sfree((*pmedata)->fftgridA);
-    sfree((*pmedata)->cfftgridA);
-    gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupA);
-
-    if ((*pmedata)->pmegridB.grid.grid != NULL)
+    for (i = 0; i < (*pmedata)->ngrids; ++i)
     {
-        pmegrids_destroy(&(*pmedata)->pmegridB);
-        sfree((*pmedata)->fftgridB);
-        sfree((*pmedata)->cfftgridB);
-        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setupB);
+        pmegrids_destroy(&(*pmedata)->pmegrid[i]);
+        sfree((*pmedata)->fftgrid[i]);
+        sfree((*pmedata)->cfftgrid[i]);
+        gmx_parallel_3dfft_destroy((*pmedata)->pfft_setup[i]);
     }
+
+    sfree((*pmedata)->lb_buf1);
+    sfree((*pmedata)->lb_buf2);
+
     for (thread = 0; thread < (*pmedata)->nthread; thread++)
     {
         free_work(&(*pmedata)->work[thread]);
@@ -2743,7 +3031,7 @@ static double pme_load_imbalance(gmx_pme_t pme)
     return (n1 + n2 + 3*n3)/(double)(6*pme->nkx*pme->nky*pme->nkz);
 }
 
-static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
+static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc,
                           int dimind, gmx_bool bSpread)
 {
     int nk, k, s, thread;
@@ -2770,7 +3058,6 @@ static void init_atomcomm(gmx_pme_t pme, pme_atomcomm_t *atc, t_commrec *cr,
 
     if (atc->nslab > 1)
     {
-        /* These three allocations are not required for particle decomp. */
         snew(atc->node_dest, atc->nslab);
         snew(atc->node_src, atc->nslab);
         setup_coordinate_communication(atc);
@@ -2846,8 +3133,9 @@ init_overlap_comm(pme_overlap_t *  ol,
     {
         /* s2g0 the local interpolation grid start.
          * s2g1 the local interpolation grid end.
-         * Because grid overlap communication only goes forward,
-         * the grid the slabs for fft's should be rounded down.
+         * Since calc_pidx divides particles, not grid lines, evenly
+         * in space along dimension x or y, we need to round s2g0 down
+         * and s2g1 up.
          */
         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
@@ -2988,33 +3276,36 @@ make_gridindex5_to_localindex(int n, int local_start, int local_range,
     *fraction_shift  = fsh;
 }
 
-static pme_spline_work_t *make_pme_spline_work(int order)
+static pme_spline_work_t *make_pme_spline_work(int gmx_unused order)
 {
     pme_spline_work_t *work;
 
-#ifdef PME_SSE
-    float  tmp[8];
-    __m128 zero_SSE;
-    int    of, i;
+#ifdef PME_SIMD4_SPREAD_GATHER
+    real             tmp[GMX_SIMD4_WIDTH*3], *tmp_aligned;
+    gmx_simd4_real_t zero_S;
+    gmx_simd4_real_t real_mask_S0, real_mask_S1;
+    int              of, i;
+
+    snew_aligned(work, 1, SIMD4_ALIGNMENT);
 
-    snew_aligned(work, 1, 16);
+    tmp_aligned = gmx_simd4_align_r(tmp);
 
-    zero_SSE = _mm_setzero_ps();
+    zero_S = gmx_simd4_setzero_r();
 
     /* Generate bit masks to mask out the unused grid entries,
      * as we only operate on order of the 8 grid entries that are
-     * load into 2 SSE float registers.
+     * loaded into 2 SIMD registers.
      */
-    for (of = 0; of < 8-(order-1); of++)
+    for (of = 0; of < 2*GMX_SIMD4_WIDTH-(order-1); of++)
     {
-        for (i = 0; i < 8; i++)
+        for (i = 0; i < 2*GMX_SIMD4_WIDTH; i++)
         {
-            tmp[i] = (i >= of && i < of+order ? 1 : 0);
+            tmp_aligned[i] = (i >= of && i < of+order ? -1.0 : 1.0);
         }
-        work->mask_SSE0[of] = _mm_loadu_ps(tmp);
-        work->mask_SSE1[of] = _mm_loadu_ps(tmp+4);
-        work->mask_SSE0[of] = _mm_cmpgt_ps(work->mask_SSE0[of], zero_SSE);
-        work->mask_SSE1[of] = _mm_cmpgt_ps(work->mask_SSE1[of], zero_SSE);
+        real_mask_S0      = gmx_simd4_load_r(tmp_aligned);
+        real_mask_S1      = gmx_simd4_load_r(tmp_aligned+GMX_SIMD4_WIDTH);
+        work->mask_S0[of] = gmx_simd4_cmplt_r(real_mask_S0, zero_S);
+        work->mask_S1[of] = gmx_simd4_cmplt_r(real_mask_S1, zero_S);
     }
 #else
     work = NULL;
@@ -3023,29 +3314,59 @@ static pme_spline_work_t *make_pme_spline_work(int order)
     return work;
 }
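
Stripped of the SIMD types, the mask setup above computes, for every possible
starting offset of an order-point stencil within two 4-wide registers, a mask
that is "on" exactly for entries of .. of+order-1. A scalar sketch (W stands
in for GMX_SIMD4_WIDTH):

    #include <stdbool.h>

    #define W 4 /* stand-in for GMX_SIMD4_WIDTH */

    /* mask[of][i] is true iff grid entry i belongs to the order-point
     * stencil starting at offset of; the remaining entries get masked
     * out of the SIMD loads. */
    static void make_spline_masks_sketch(int order, bool mask[2*W][2*W])
    {
        int of, i;

        for (of = 0; of < 2*W - (order - 1); of++)
        {
            for (i = 0; i < 2*W; i++)
            {
                mask[of][i] = (i >= of && i < of + order);
            }
        }
    }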
 
-static void
-gmx_pme_check_grid_restrictions(FILE *fplog, char dim, int nnodes, int *nk)
+void gmx_pme_check_restrictions(int pme_order,
+                                int nkx, int nky, int nkz,
+                                int nnodes_major,
+                                int nnodes_minor,
+                                gmx_bool bUseThreads,
+                                gmx_bool bFatal,
+                                gmx_bool *bValidSettings)
 {
-    int nk_new;
-
-    if (*nk % nnodes != 0)
+    if (pme_order > PME_ORDER_MAX)
     {
-        nk_new = nnodes*(*nk/nnodes + 1);
+        if (!bFatal)
+        {
+            *bValidSettings = FALSE;
+            return;
+        }
+        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
+                  pme_order, PME_ORDER_MAX);
+    }
 
-        if (2*nk_new >= 3*(*nk))
+    if (nkx <= pme_order*(nnodes_major > 1 ? 2 : 1) ||
+        nky <= pme_order*(nnodes_minor > 1 ? 2 : 1) ||
+        nkz <= pme_order)
+    {
+        if (!bFatal)
         {
-            gmx_fatal(FARGS, "The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). The grid size would have to be increased by more than 50%% to make the grid divisible. Change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).",
-                      dim, *nk, dim, nnodes, dim);
+            *bValidSettings = FALSE;
+            return;
         }
+        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order",
+                  pme_order);
+    }
 
-        if (fplog != NULL)
+    /* Check for a limitation of the (current) sum_fftgrid_dd code.
+     * We only allow multiple communication pulses in dim 1, not in dim 0.
+     */
+    if (bUseThreads && (nkx < nnodes_major*pme_order &&
+                        nkx != nnodes_major*(pme_order - 1)))
+    {
+        if (!bFatal)
         {
-            fprintf(fplog, "\nNOTE: The PME grid size in dim %c (%d) is not divisble by the number of nodes doing PME in dim %c (%d). Increasing the PME grid size in dim %c to %d. This will increase the accuracy and will not decrease the performance significantly on this number of nodes. For optimal performance change the total number of nodes or the number of domain decomposition cells in x or the PME grid %c dimension (and the cut-off).\n\n",
-                    dim, *nk, dim, nnodes, dim, nk_new, dim);
+            *bValidSettings = FALSE;
+            return;
         }
+        gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
+                  nkx/(double)nnodes_major, pme_order);
+    }
 
-        *nk = nk_new;
+    if (bValidSettings != NULL)
+    {
+        *bValidSettings = TRUE;
     }
+
+    return;
 }
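
A hypothetical caller sketch for the non-fatal mode: with bFatal = FALSE the
function reports through *bValidSettings instead of calling gmx_fatal(),
which is what a grid-probing caller would use (gmx_pme_init below instead
passes TRUE and NULL):

    static gmx_bool pme_settings_valid_sketch(int pme_order,
                                              int nkx, int nky, int nkz,
                                              int nnodes_major,
                                              int nnodes_minor,
                                              gmx_bool bUseThreads)
    {
        gmx_bool bValid;

        gmx_pme_check_restrictions(pme_order, nkx, nky, nkz,
                                   nnodes_major, nnodes_minor,
                                   bUseThreads,
                                   FALSE, &bValid);
        return bValid;
    }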
 
 int gmx_pme_init(gmx_pme_t *         pmedata,
@@ -3054,13 +3375,14 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                  int                 nnodes_minor,
                  t_inputrec *        ir,
                  int                 homenr,
-                 gmx_bool            bFreeEnergy,
+                 gmx_bool            bFreeEnergy_q,
+                 gmx_bool            bFreeEnergy_lj,
                  gmx_bool            bReproducible,
                  int                 nthread)
 {
     gmx_pme_t pme = NULL;
 
-    pme_atomcomm_t *atc;
+    int  use_threads, sum_use_threads, i;
     ivec ndata;
 
     if (debug)
@@ -3069,11 +3391,9 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
     }
     snew(pme, 1);
 
-    pme->redist_init         = FALSE;
     pme->sum_qgrid_tmp       = NULL;
     pme->sum_qgrid_dd_tmp    = NULL;
     pme->buf_nalloc          = 0;
-    pme->redist_buf_nalloc   = 0;
 
     pme->nnodes              = 1;
     pme->bPPnode             = TRUE;
@@ -3090,7 +3410,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
         if (pme->nnodes != nnodes_major*nnodes_minor)
         {
-            gmx_incons("PME node count mismatch");
+            gmx_incons("PME rank count mismatch");
         }
     }
     else
@@ -3139,7 +3459,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         {
             if (pme->nnodes % nnodes_major != 0)
             {
-                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
+                gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
             }
             pme->ndecompdim = 2;
 
@@ -3160,50 +3480,49 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
 
     pme->nthread = nthread;
 
+    /* Check if any of the PME MPI ranks uses threads */
+    use_threads = (pme->nthread > 1 ? 1 : 0);
+#ifdef GMX_MPI
+    if (pme->nnodes > 1)
+    {
+        MPI_Allreduce(&use_threads, &sum_use_threads, 1, MPI_INT,
+                      MPI_SUM, pme->mpi_comm);
+    }
+    else
+#endif
+    {
+        sum_use_threads = use_threads;
+    }
+    pme->bUseThreads = (sum_use_threads > 0);
+
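
The flag reduction above, in isolation: every rank contributes 0 or 1 and
MPI_SUM (MPI_LOR would serve equally well) gives all ranks the same answer.
A self-contained sketch:

    #include <mpi.h>

    static int any_rank_uses_threads_sketch(int nthread, MPI_Comm comm)
    {
        int use_threads = (nthread > 1 ? 1 : 0);
        int sum         = use_threads;
        int nranks;

        MPI_Comm_size(comm, &nranks);
        if (nranks > 1)
        {
            MPI_Allreduce(&use_threads, &sum, 1, MPI_INT, MPI_SUM, comm);
        }
        return (sum > 0);
    }
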
     if (ir->ePBC == epbcSCREW)
     {
         gmx_fatal(FARGS, "pme does not (yet) work with pbc = screw");
     }
 
-    pme->bFEP        = ((ir->efep != efepNO) && bFreeEnergy);
+    pme->bFEP_q      = ((ir->efep != efepNO) && bFreeEnergy_q);
+    pme->bFEP_lj     = ((ir->efep != efepNO) && bFreeEnergy_lj);
+    pme->bFEP        = (pme->bFEP_q || pme->bFEP_lj);
     pme->nkx         = ir->nkx;
     pme->nky         = ir->nky;
     pme->nkz         = ir->nkz;
     pme->bP3M        = (ir->coulombtype == eelP3M_AD || getenv("GMX_PME_P3M") != NULL);
     pme->pme_order   = ir->pme_order;
-    pme->epsilon_r   = ir->epsilon_r;
-
-    if (pme->pme_order > PME_ORDER_MAX)
-    {
-        gmx_fatal(FARGS, "pme_order (%d) is larger than the maximum allowed value (%d). Modify and recompile the code if you really need such a high order.",
-                  pme->pme_order, PME_ORDER_MAX);
-    }
 
-    /* Currently pme.c supports only the fft5d FFT code.
-     * Therefore the grid always needs to be divisible by nnodes.
-     * When the old 1D code is also supported again, change this check.
-     *
-     * This check should be done before calling gmx_pme_init
-     * and fplog should be passed iso stderr.
-     *
-       if (pme->ndecompdim >= 2)
-     */
-    if (pme->ndecompdim >= 1)
-    {
-        /*
-           gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
-                                        'x',nnodes_major,&pme->nkx);
-           gmx_pme_check_grid_restrictions(pme->nodeid==0 ? stderr : NULL,
-                                        'y',nnodes_minor,&pme->nky);
-         */
-    }
+    /* Always constant electrostatics coefficients */
+    pme->epsilon_r   = ir->epsilon_r;
 
-    if (pme->nkx <= pme->pme_order*(pme->nnodes_major > 1 ? 2 : 1) ||
-        pme->nky <= pme->pme_order*(pme->nnodes_minor > 1 ? 2 : 1) ||
-        pme->nkz <= pme->pme_order)
-    {
-        gmx_fatal(FARGS, "The PME grid sizes need to be larger than pme_order (%d) and for dimensions with domain decomposition larger than 2*pme_order", pme->pme_order);
-    }
+    /* Always constant LJ coefficients */
+    pme->ljpme_combination_rule = ir->ljpme_combination_rule;
+
+    /* If we violate restrictions, generate a fatal error here */
+    gmx_pme_check_restrictions(pme->pme_order,
+                               pme->nkx, pme->nky, pme->nkz,
+                               pme->nnodes_major,
+                               pme->nnodes_minor,
+                               pme->bUseThreads,
+                               TRUE,
+                               NULL);
 
     if (pme->nnodes > 1)
     {
@@ -3214,10 +3533,10 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         MPI_Type_commit(&(pme->rvec_mpi));
 #endif
 
-        /* Note that the charge spreading and force gathering, which usually
+        /* Note that the coefficient spreading and force gathering, which usually
          * takes about the same amount of time as FFT+solve_pme,
          * is always fully load balanced
-         * (unless the charge distribution is inhomogeneous).
+         * (unless the coefficient distribution is inhomogeneous).
          */
 
         imbal = pme_load_imbalance(pme);
@@ -3227,8 +3546,8 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                     "\n"
                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
                     "      For optimal PME load balancing\n"
-                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
-                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
+                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
                     "\n",
                     (int)((imbal-1)*100 + 0.5),
                     pme->nkx, pme->nky, pme->nnodes_major,
@@ -3261,14 +3580,12 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                       pme->nky,
                       (div_round_up(pme->nkx, pme->nnodes_major)+pme->pme_order+1)*pme->nkz);
 
-    /* Check for a limitation of the (current) sum_fftgrid_dd code.
-     * We only allow multiple communication pulses in dim 1, not in dim 0.
+    /* Double-check for a limitation of the (current) sum_fftgrid_dd code.
+     * Note that gmx_pme_check_restrictions checked for this already.
      */
-    if (pme->nthread > 1 && (pme->overlap[0].noverlap_nodes > 1 ||
-                             pme->nkx < pme->nnodes_major*pme->pme_order))
+    if (pme->bUseThreads && pme->overlap[0].noverlap_nodes > 1)
     {
-        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x and should be >= pme_order (%d). To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
-                  pme->nkx/(double)pme->nnodes_major, pme->pme_order);
+        gmx_incons("More than one communication pulse required for grid overlap communication along the major dimension while using threads");
     }
 
     snew(pme->bsp_mod[XX], pme->nkx);
@@ -3303,48 +3620,49 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                                   pme->pmegrid_nz_base,
                                   &pme->nnz, &pme->fshz);
 
-    pmegrids_init(&pme->pmegridA,
-                  pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
-                  pme->pmegrid_nz_base,
-                  pme->pme_order,
-                  pme->nthread,
-                  pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
-                  pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
-
     pme->spline_work = make_pme_spline_work(pme->pme_order);
 
-    ndata[0] = pme->nkx;
-    ndata[1] = pme->nky;
-    ndata[2] = pme->nkz;
-
-    /* This routine will allocate the grid data to fit the FFTs */
-    gmx_parallel_3dfft_init(&pme->pfft_setupA, ndata,
-                            &pme->fftgridA, &pme->cfftgridA,
-                            pme->mpi_comm_d,
-                            pme->overlap[0].s2g0, pme->overlap[1].s2g0,
-                            bReproducible, pme->nthread);
-
-    if (bFreeEnergy)
-    {
-        pmegrids_init(&pme->pmegridB,
-                      pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
-                      pme->pmegrid_nz_base,
-                      pme->pme_order,
-                      pme->nthread,
-                      pme->nkx % pme->nnodes_major != 0,
-                      pme->nky % pme->nnodes_minor != 0);
-
-        gmx_parallel_3dfft_init(&pme->pfft_setupB, ndata,
-                                &pme->fftgridB, &pme->cfftgridB,
-                                pme->mpi_comm_d,
-                                pme->overlap[0].s2g0, pme->overlap[1].s2g0,
-                                bReproducible, pme->nthread);
+    ndata[0]    = pme->nkx;
+    ndata[1]    = pme->nky;
+    ndata[2]    = pme->nkz;
+    /* It doesn't matter if ngrids is a bit too large here: actual
+     * grid data is only allocated and used for the grids we need.
+     */
+    if (EVDW_PME(ir->vdwtype))
+    {
+        pme->ngrids = ((ir->ljpme_combination_rule == eljpmeLB) ? DO_Q_AND_LJ_LB : DO_Q_AND_LJ);
     }
     else
     {
-        pme->pmegridB.grid.grid = NULL;
-        pme->fftgridB           = NULL;
-        pme->cfftgridB          = NULL;
+        pme->ngrids = DO_Q;
+    }
+    snew(pme->fftgrid, pme->ngrids);
+    snew(pme->cfftgrid, pme->ngrids);
+    snew(pme->pfft_setup, pme->ngrids);
+
+    for (i = 0; i < pme->ngrids; ++i)
+    {
+        if ((i <  DO_Q && EEL_PME(ir->coulombtype) && (i == 0 ||
+                                                       bFreeEnergy_q)) ||
+            (i >= DO_Q && EVDW_PME(ir->vdwtype) && (i == 2 ||
+                                                    bFreeEnergy_lj ||
+                                                    ir->ljpme_combination_rule == eljpmeLB)))
+        {
+            pmegrids_init(&pme->pmegrid[i],
+                          pme->pmegrid_nx, pme->pmegrid_ny, pme->pmegrid_nz,
+                          pme->pmegrid_nz_base,
+                          pme->pme_order,
+                          pme->bUseThreads,
+                          pme->nthread,
+                          pme->overlap[0].s2g1[pme->nodeid_major]-pme->overlap[0].s2g0[pme->nodeid_major+1],
+                          pme->overlap[1].s2g1[pme->nodeid_minor]-pme->overlap[1].s2g0[pme->nodeid_minor+1]);
+            /* This routine will allocate the grid data to fit the FFTs */
+            gmx_parallel_3dfft_init(&pme->pfft_setup[i], ndata,
+                                    &pme->fftgrid[i], &pme->cfftgrid[i],
+                                    pme->mpi_comm_d,
+                                    bReproducible, pme->nthread);
+
+        }
     }
 
     if (!pme->bP3M)
@@ -3359,10 +3677,10 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
     }
 
     /* Use atc[0] for spreading */
-    init_atomcomm(pme, &pme->atc[0], cr, nnodes_major > 1 ? 0 : 1, TRUE);
+    init_atomcomm(pme, &pme->atc[0], nnodes_major > 1 ? 0 : 1, TRUE);
     if (pme->ndecompdim >= 2)
     {
-        init_atomcomm(pme, &pme->atc[1], cr, 1, FALSE);
+        init_atomcomm(pme, &pme->atc[1], 1, FALSE);
     }
 
     if (pme->nnodes == 1)
@@ -3371,6 +3689,10 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         pme_realloc_atomcomm_things(&pme->atc[0]);
     }
 
+    pme->lb_buf1       = NULL;
+    pme->lb_buf2       = NULL;
+    pme->lb_buf_nalloc = 0;
+
     {
         int thread;
 
@@ -3403,7 +3725,7 @@ static void reuse_pmegrids(const pmegrids_t *old, pmegrids_t *new)
     sfree_aligned(new->grid.grid);
     new->grid.grid = old->grid.grid;
 
-    if (new->nthread > 1 && new->nthread == old->nthread)
+    if (new->grid_th != NULL && new->nthread == old->nthread)
     {
         sfree_aligned(new->grid_all);
         for (t = 0; t < new->nthread; t++)
@@ -3438,12 +3760,12 @@ int gmx_pme_reinit(gmx_pme_t *         pmedata,
     }
 
     ret = gmx_pme_init(pmedata, cr, pme_src->nnodes_major, pme_src->nnodes_minor,
-                       &irc, homenr, pme_src->bFEP, FALSE, pme_src->nthread);
+                       &irc, homenr, pme_src->bFEP_q, pme_src->bFEP_lj, FALSE, pme_src->nthread);
 
     if (ret == 0)
     {
         /* We can easily reuse the allocated pme grids in pme_src */
-        reuse_pmegrids(&pme_src->pmegridA, &(*pmedata)->pmegridA);
+        reuse_pmegrids(&pme_src->pmegrid[PME_GRID_QA], &(*pmedata)->pmegrid[PME_GRID_QA]);
         /* We would like to reuse the fft grids, but that's harder */
     }
 
@@ -3451,8 +3773,8 @@ int gmx_pme_reinit(gmx_pme_t *         pmedata,
 }
 
 
-static void copy_local_grid(gmx_pme_t pme,
-                            pmegrids_t *pmegrids, int thread, real *fftgrid)
+static void copy_local_grid(gmx_pme_t pme, pmegrids_t *pmegrids,
+                            int grid_index, int thread, real *fftgrid)
 {
     ivec local_fft_ndata, local_fft_offset, local_fft_size;
     int  fft_my, fft_mz;
@@ -3463,7 +3785,7 @@ static void copy_local_grid(gmx_pme_t pme,
     pmegrid_t *pmegrid;
     real *grid_th;
 
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -3507,14 +3829,15 @@ static void copy_local_grid(gmx_pme_t pme,
 static void
 reduce_threadgrid_overlap(gmx_pme_t pme,
                           const pmegrids_t *pmegrids, int thread,
-                          real *fftgrid, real *commbuf_x, real *commbuf_y)
+                          real *fftgrid, real *commbuf_x, real *commbuf_y,
+                          int grid_index)
 {
     ivec local_fft_ndata, local_fft_offset, local_fft_size;
     int  fft_nx, fft_ny, fft_nz;
     int  fft_my, fft_mz;
     int  buf_my = -1;
     int  nsx, nsy, nsz;
-    ivec ne;
+    ivec localcopy_end;
     int  offx, offy, offz, x, y, z, i0, i0t;
     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
@@ -3525,7 +3848,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
     const real *grid_th;
     real *commbuf = NULL;
 
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -3550,8 +3873,14 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
 
     for (d = 0; d < DIM; d++)
     {
-        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
-                    local_fft_ndata[d]);
+        /* Determine up to where our thread needs to copy from the
+         * thread-local charge spreading grid to the rank-local FFT grid.
+         * This is up to our spreading grid end minus order-1 and
+         * not beyond the local FFT grid.
+         */
+        localcopy_end[d] =
+            min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+                local_fft_ndata[d]);
     }
 
     offx = pmegrid->offset[XX];
@@ -3566,7 +3895,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
     /* Now loop over all the thread data blocks that contribute
      * to the grid region we (our thread) are operating on.
      */
-    /* Note that ffy_nx/y is equal to the number of grid points
+    /* Note that fft_nx/y is equal to the number of grid points
      * between the first point of our node grid and the one of the next node.
      */
     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
@@ -3582,12 +3911,15 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
         }
         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
         ox       += pmegrid_g->offset[XX];
+        /* Determine the end of our part of the source grid */
         if (!bCommX)
         {
-            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+            /* Use our thread local source grid and target grid part */
+            tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
         }
         else
         {
+            /* Use our thread local source grid and the spreading range */
             tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
         }
 
@@ -3604,12 +3936,15 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
             }
             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
             oy       += pmegrid_g->offset[YY];
+            /* Determine the end of our part of the source grid */
             if (!bCommY)
             {
-                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+                /* Use our thread local source grid and target grid part */
+                ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
             }
             else
             {
+                /* Use our thread local source grid and the spreading range */
                 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
             }
 
@@ -3624,7 +3959,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                 }
                 pmegrid_g = &pmegrids->grid_th[fz];
                 oz       += pmegrid_g->offset[ZZ];
-                tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+                tz1       = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
 
                 if (sx == 0 && sy == 0 && sz == 0)
                 {
@@ -3680,7 +4015,13 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                     if (bCommY)
                     {
                         commbuf = commbuf_y;
-                        buf_my  = ty1 - offy;
+                        /* The y-size of the communication buffer is set by
+                         * the overlap of the grid part of our local slab
+                         * with the part starting at the next slab.
+                         */
+                        buf_my  =
+                            pme->overlap[1].s2g1[pme->nodeid_minor] -
+                            pme->overlap[1].s2g0[pme->nodeid_minor+1];
                         if (bCommX)
                         {
                             /* We index commbuf modulo the local grid size */
@@ -3735,7 +4076,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
 }
 
 
-static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
+static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
 {
     ivec local_fft_ndata, local_fft_offset, local_fft_size;
     pme_overlap_t *overlap;
@@ -3750,13 +4091,13 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
     int  x, y, z, indg, indb;
 
     /* Note that this routine is only used for forward communication.
-     * Since the force gathering, unlike the charge spreading,
+     * Since the force gathering, unlike the coefficient spreading,
      * can be trivially parallelized over the particles,
      * the backwards process is much simpler and can use the "old"
      * communication setup.
      */
 
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[grid_index],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -3793,6 +4134,12 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
             sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
             recvptr = overlap->recvbuf;
 
+            if (debug != NULL)
+            {
+                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
+                        local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
+            }
+
 #ifdef GMX_MPI
             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
                          send_id, ipulse,
@@ -3859,7 +4206,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
 
         if (debug != NULL)
         {
-            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
         }
 
@@ -3890,7 +4237,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid)
 static void spread_on_grid(gmx_pme_t pme,
                            pme_atomcomm_t *atc, pmegrids_t *grids,
                            gmx_bool bCalcSplines, gmx_bool bSpread,
-                           real *fftgrid)
+                           real *fftgrid, gmx_bool bDoSplines, int grid_index)
 {
     int nthread, thread;
 #ifdef PME_TIME_THREADS
@@ -3919,7 +4266,7 @@ static void spread_on_grid(gmx_pme_t pme,
             /* Compute fftgrid index for all atoms,
              * with help of some extra variables.
              */
-            calc_interpolation_idx(pme, atc, start, end, thread);
+            calc_interpolation_idx(pme, atc, start, grid_index, end, thread);
         }
     }
 #ifdef PME_TIME_THREADS
@@ -3934,22 +4281,34 @@ static void spread_on_grid(gmx_pme_t pme,
     for (thread = 0; thread < nthread; thread++)
     {
         splinedata_t *spline;
-        pmegrid_t *grid;
+        pmegrid_t *grid = NULL;
 
         /* make local bsplines  */
-        if (grids == NULL || grids->nthread == 1)
+        if (grids == NULL || !pme->bUseThreads)
         {
             spline = &atc->spline[0];
 
             spline->n = atc->n;
 
-            grid = &grids->grid;
+            if (bSpread)
+            {
+                grid = &grids->grid;
+            }
         }
         else
         {
             spline = &atc->spline[thread];
 
-            make_thread_local_ind(atc, thread, spline);
+            if (grids->nthread == 1)
+            {
+                /* One thread, we operate on all coefficients */
+                spline->n = atc->n;
+            }
+            else
+            {
+                /* Get the indices our thread should operate on */
+                make_thread_local_ind(atc, thread, spline);
+            }
 
             grid = &grids->grid_th[thread];
         }
@@ -3957,7 +4316,7 @@ static void spread_on_grid(gmx_pme_t pme,
         if (bCalcSplines)
         {
             make_bsplines(spline->theta, spline->dtheta, pme->pme_order,
-                          atc->fractx, spline->n, spline->ind, atc->q, pme->bFEP);
+                          atc->fractx, spline->n, spline->ind, atc->coefficient, bDoSplines);
         }
 
         if (bSpread)
@@ -3966,11 +4325,11 @@ static void spread_on_grid(gmx_pme_t pme,
 #ifdef PME_TIME_SPREAD
             ct1a = omp_cyc_start();
 #endif
-            spread_q_bsplines_thread(grid, atc, spline, pme->spline_work);
+            spread_coefficients_bsplines_thread(grid, atc, spline, pme->spline_work);
 
-            if (grids->nthread > 1)
+            if (pme->bUseThreads)
             {
-                copy_local_grid(pme, grids, thread, fftgrid);
+                copy_local_grid(pme, grids, grid_index, thread, fftgrid);
             }
 #ifdef PME_TIME_SPREAD
             ct1a          = omp_cyc_end(ct1a);
@@ -3983,7 +4342,7 @@ static void spread_on_grid(gmx_pme_t pme,
     cs2 += (double)c2;
 #endif
 
-    if (bSpread && grids->nthread > 1)
+    if (bSpread && pme->bUseThreads)
     {
 #ifdef PME_TIME_THREADS
         c3 = omp_cyc_start();
@@ -3994,7 +4353,8 @@ static void spread_on_grid(gmx_pme_t pme,
             reduce_threadgrid_overlap(pme, grids, thread,
                                       fftgrid,
                                       pme->overlap[0].sendbuf,
-                                      pme->overlap[1].sendbuf);
+                                      pme->overlap[1].sendbuf,
+                                      grid_index);
         }
 #ifdef PME_TIME_THREADS
         c3   = omp_cyc_end(c3);
@@ -4003,8 +4363,11 @@ static void spread_on_grid(gmx_pme_t pme,
 
         if (pme->nnodes > 1)
         {
-            /* Communicate the overlapping part of the fftgrid */
-            sum_fftgrid_dd(pme, fftgrid);
+            /* Communicate the overlapping part of the fftgrid.
+             * For this communication call we need to check pme->bUseThreads
+             * to have all ranks communicate here, regardless of pme->nthread.
+             */
+            sum_fftgrid_dd(pme, fftgrid, grid_index);
         }
     }
 
@@ -4049,7 +4412,7 @@ static void dump_local_fftgrid(gmx_pme_t pme, const real *fftgrid)
 {
     ivec local_fft_ndata, local_fft_offset, local_fft_size;
 
-    gmx_parallel_3dfft_real_limits(pme->pfft_setupA,
+    gmx_parallel_3dfft_real_limits(pme->pfft_setup[PME_GRID_QA],
                                    local_fft_ndata,
                                    local_fft_offset,
                                    local_fft_size);
@@ -4076,7 +4439,7 @@ void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
     {
         gmx_incons("gmx_pme_calc_energy called in parallel");
     }
-    if (pme->bFEP > 1)
+    if (pme->bFEP_q > 1)
     {
         gmx_incons("gmx_pme_calc_energy with free energy");
     }
@@ -4092,21 +4455,23 @@ void gmx_pme_calc_energy(gmx_pme_t pme, int n, rvec *x, real *q, real *V)
     atc->pme_order = pme->pme_order;
     atc->n         = n;
     pme_realloc_atomcomm_things(atc);
-    atc->x         = x;
-    atc->q         = q;
+    atc->x           = x;
+    atc->coefficient = q;
 
     /* We only use the A-charges grid */
-    grid = &pme->pmegridA;
+    grid = &pme->pmegrid[PME_GRID_QA];
 
-    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgridA);
+    /* Only calculate the spline coefficients, don't actually spread */
+    spread_on_grid(pme, atc, NULL, TRUE, FALSE, pme->fftgrid[PME_GRID_QA], FALSE, PME_GRID_QA);
 
     *V = gather_energy_bsplines(pme, grid->grid.grid, atc);
 }
 
 
-static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
+static void reset_pmeonly_counters(gmx_wallcycle_t wcycle,
+                                   gmx_walltime_accounting_t walltime_accounting,
                                    t_nrnb *nrnb, t_inputrec *ir,
-                                   gmx_large_int_t step)
+                                   gmx_int64_t step)
 {
     /* Reset all the counters related to performance over the run */
     wallcycle_stop(wcycle, ewcRUN);
@@ -4119,6 +4484,7 @@ static void reset_pmeonly_counters(t_commrec *cr, gmx_wallcycle_t wcycle,
     }
     ir->init_step = step;
     wallcycle_start(wcycle, ewcRUN);
+    walltime_accounting_start(walltime_accounting);
 }
 
 
@@ -4155,11 +4521,11 @@ static void gmx_pmeonly_switch(int *npmedata, gmx_pme_t **pmedata,
     *pme_ret = (*pmedata)[ind];
 }
 
-
 int gmx_pmeonly(gmx_pme_t pme,
-                t_commrec *cr,    t_nrnb *nrnb,
+                t_commrec *cr,    t_nrnb *mynrnb,
                 gmx_wallcycle_t wcycle,
-                real ewaldcoeff,  gmx_bool bGatherOnly,
+                gmx_walltime_accounting_t walltime_accounting,
+                real ewaldcoeff_q, real ewaldcoeff_lj,
                 t_inputrec *ir)
 {
     int npmedata;
@@ -4170,14 +4536,18 @@ int gmx_pmeonly(gmx_pme_t pme,
     matrix box;
     rvec *x_pp      = NULL, *f_pp = NULL;
     real *chargeA   = NULL, *chargeB = NULL;
-    real lambda     = 0;
+    real *c6A       = NULL, *c6B = NULL;
+    real *sigmaA    = NULL, *sigmaB = NULL;
+    real lambda_q   = 0;
+    real lambda_lj  = 0;
     int  maxshift_x = 0, maxshift_y = 0;
-    real energy, dvdlambda;
-    matrix vir;
+    real energy_q, energy_lj, dvdlambda_q, dvdlambda_lj;
+    matrix vir_q, vir_lj;
     float cycles;
     int  count;
     gmx_bool bEnerVir;
-    gmx_large_int_t step, step_rel;
+    int pme_flags;
+    gmx_int64_t step, step_rel;
     ivec grid_switch;
 
     /* This data will only use with PME tuning, i.e. switching PME grids */
@@ -4187,7 +4557,7 @@ int gmx_pmeonly(gmx_pme_t pme,
 
     pme_pp = gmx_pme_pp_init(cr);
 
-    init_nrnb(nrnb);
+    init_nrnb(mynrnb);
 
     count = 0;
     do /****** this is a quasi-loop over time steps! */
@@ -4196,14 +4566,19 @@ int gmx_pmeonly(gmx_pme_t pme,
         do
         {
             /* Domain decomposition */
-            ret = gmx_pme_recv_q_x(pme_pp,
-                                   &natoms,
-                                   &chargeA, &chargeB, box, &x_pp, &f_pp,
-                                   &maxshift_x, &maxshift_y,
-                                   &pme->bFEP, &lambda,
-                                   &bEnerVir,
-                                   &step,
-                                   grid_switch, &ewaldcoeff);
+            ret = gmx_pme_recv_coeffs_coords(pme_pp,
+                                             &natoms,
+                                             &chargeA, &chargeB,
+                                             &c6A, &c6B,
+                                             &sigmaA, &sigmaB,
+                                             box, &x_pp, &f_pp,
+                                             &maxshift_x, &maxshift_y,
+                                             &pme->bFEP_q, &pme->bFEP_lj,
+                                             &lambda_q, &lambda_lj,
+                                             &bEnerVir,
+                                             &pme_flags,
+                                             &step,
+                                             grid_switch, &ewaldcoeff_q, &ewaldcoeff_lj);
 
             if (ret == pmerecvqxSWITCHGRID)
             {
@@ -4214,7 +4589,7 @@ int gmx_pmeonly(gmx_pme_t pme,
             if (ret == pmerecvqxRESETCOUNTERS)
             {
                 /* Reset the cycle and flop counters */
-                reset_pmeonly_counters(cr, wcycle, nrnb, ir, step);
+                reset_pmeonly_counters(wcycle, walltime_accounting, mynrnb, ir, step);
             }
         }
         while (ret == pmerecvqxSWITCHGRID || ret == pmerecvqxRESETCOUNTERS);
@@ -4230,42 +4605,125 @@ int gmx_pmeonly(gmx_pme_t pme,
         if (count == 0)
         {
             wallcycle_start(wcycle, ewcRUN);
+            walltime_accounting_start(walltime_accounting);
         }
 
         wallcycle_start(wcycle, ewcPMEMESH);
 
-        dvdlambda = 0;
-        clear_mat(vir);
-        gmx_pme_do(pme, 0, natoms, x_pp, f_pp, chargeA, chargeB, box,
-                   cr, maxshift_x, maxshift_y, nrnb, wcycle, vir, ewaldcoeff,
-                   &energy, lambda, &dvdlambda,
-                   GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
+        dvdlambda_q  = 0;
+        dvdlambda_lj = 0;
+        clear_mat(vir_q);
+        clear_mat(vir_lj);
+
+        gmx_pme_do(pme, 0, natoms, x_pp, f_pp,
+                   chargeA, chargeB, c6A, c6B, sigmaA, sigmaB, box,
+                   cr, maxshift_x, maxshift_y, mynrnb, wcycle,
+                   vir_q, ewaldcoeff_q, vir_lj, ewaldcoeff_lj,
+                   &energy_q, &energy_lj, lambda_q, lambda_lj, &dvdlambda_q, &dvdlambda_lj,
+                   pme_flags | GMX_PME_DO_ALL_F | (bEnerVir ? GMX_PME_CALC_ENER_VIR : 0));
 
         cycles = wallcycle_stop(wcycle, ewcPMEMESH);
 
         gmx_pme_send_force_vir_ener(pme_pp,
-                                    f_pp, vir, energy, dvdlambda,
-                                    cycles);
+                                    f_pp, vir_q, energy_q, vir_lj, energy_lj,
+                                    dvdlambda_q, dvdlambda_lj, cycles);
 
         count++;
     } /***** end of quasi-loop, we stop with the break above */
     while (TRUE);
 
+    walltime_accounting_end(walltime_accounting);
+
     return 0;
 }
 
+static void
+calc_initial_lb_coeffs(gmx_pme_t pme, real *local_c6, real *local_sigma)
+{
+    int  i;
+
+    for (i = 0; i < pme->atc[0].n; ++i)
+    {
+        real sigma4;
+
+        sigma4                     = local_sigma[i];
+        sigma4                     = sigma4*sigma4;
+        sigma4                     = sigma4*sigma4;
+        pme->atc[0].coefficient[i] = local_c6[i] / sigma4;
+    }
+}
+
+static void
+calc_next_lb_coeffs(gmx_pme_t pme, real *local_sigma)
+{
+    int  i;
+
+    for (i = 0; i < pme->atc[0].n; ++i)
+    {
+        pme->atc[0].coefficient[i] *= local_sigma[i];
+    }
+}
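
Together these two helpers build the per-grid Lorentz-Berthelot coefficients
incrementally: atom i starts at c6_i/sigma_i^4 and gains one factor of
sigma_i per calc_next_lb_coeffs call, supplying the sigma powers needed for
the seven-grid expansion. The equivalent closed form, as a sketch (not a
GROMACS API):

    #include <math.h>

    /* Coefficient of atom data (c6, sigma) after calc_initial_lb_coeffs
     * plus n calls to calc_next_lb_coeffs: c6 * sigma^(n-4). */
    static double lb_coefficient_after_n_steps(double c6, double sigma, int n)
    {
        return c6*pow(sigma, n - 4);
    }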
+
+static void
+do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
+                     gmx_bool bFirst, rvec x[], real *data)
+{
+    int      d;
+    pme_atomcomm_t *atc;
+    atc = &pme->atc[0];
+
+    for (d = pme->ndecompdim - 1; d >= 0; d--)
+    {
+        int             n_d;
+        rvec           *x_d;
+        real           *param_d;
+
+        if (d == pme->ndecompdim - 1)
+        {
+            n_d     = homenr;
+            x_d     = x + start;
+            param_d = data;
+        }
+        else
+        {
+            n_d     = pme->atc[d + 1].n;
+            x_d     = atc->x;
+            param_d = atc->coefficient;
+        }
+        atc      = &pme->atc[d];
+        atc->npd = n_d;
+        if (atc->npd > atc->pd_nalloc)
+        {
+            atc->pd_nalloc = over_alloc_dd(atc->npd);
+            srenew(atc->pd, atc->pd_nalloc);
+        }
+        pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
+        where();
+        /* Redistribute x (only once) and qA/c6A or qB/c6B */
+        if (DOMAINDECOMP(cr))
+        {
+            dd_pmeredist_pos_coeffs(pme, n_d, bFirst, x_d, param_d, atc);
+        }
+    }
+}
+
 int gmx_pme_do(gmx_pme_t pme,
                int start,       int homenr,
                rvec x[],        rvec f[],
                real *chargeA,   real *chargeB,
+               real *c6A,       real *c6B,
+               real *sigmaA,    real *sigmaB,
                matrix box, t_commrec *cr,
                int  maxshift_x, int maxshift_y,
                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
-               matrix vir,      real ewaldcoeff,
-               real *energy,    real lambda,
-               real *dvdlambda, int flags)
+               matrix vir_q,      real ewaldcoeff_q,
+               matrix vir_lj,   real ewaldcoeff_lj,
+               real *energy_q,  real *energy_lj,
+               real lambda_q, real lambda_lj,
+               real *dvdlambda_q, real *dvdlambda_lj,
+               int flags)
 {
-    int     q, d, i, j, ntot, npme;
+    int     d, i, j, k, ntot, npme, grid_index, max_grid_index;
     int     nx, ny, nz;
     int     n_d, local_ny;
     pme_atomcomm_t *atc = NULL;
@@ -4273,14 +4731,18 @@ int gmx_pme_do(gmx_pme_t pme,
     real    *grid       = NULL;
     real    *ptr;
     rvec    *x_d, *f_d;
-    real    *charge = NULL, *q_d;
-    real    energy_AB[2];
-    matrix  vir_AB[2];
+    real    *coefficient = NULL;
+    real    energy_AB[4];
+    matrix  vir_AB[4];
+    real    scale, lambda;
     gmx_bool bClearF;
     gmx_parallel_3dfft_t pfft_setup;
     real *  fftgrid;
     t_complex * cfftgrid;
     int     thread;
+    gmx_bool bFirst, bDoSplines;
+    int fep_state;
+    int fep_states_lj           = pme->bFEP_lj ? 2 : 1;
     const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
     const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
 
@@ -4296,37 +4758,81 @@ int gmx_pme_do(gmx_pme_t pme,
             atc->pd_nalloc = over_alloc_dd(atc->npd);
             srenew(atc->pd, atc->pd_nalloc);
         }
-        atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+        for (d = pme->ndecompdim-1; d >= 0; d--)
+        {
+            atc           = &pme->atc[d];
+            atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
+        }
     }
     else
     {
+        atc = &pme->atc[0];
         /* This could be necessary for TPI */
         pme->atc[0].n = homenr;
+        if (DOMAINDECOMP(cr))
+        {
+            pme_realloc_atomcomm_things(atc);
+        }
+        atc->x = x;
+        atc->f = f;
     }
 
-    for (q = 0; q < (pme->bFEP ? 2 : 1); q++)
+    m_inv_ur0(box, pme->recipbox);
+    bFirst = TRUE;
+
+    /* For simplicity, we construct the splines for all particles if
+     * more than one PME calculation is needed. Some optimization
+     * could be gained by keeping track of which atoms already have
+     * splines and constructing new splines on each pass only for
+     * atoms that do not yet have them.
+     */
+
+    bDoSplines = pme->bFEP || ((flags & GMX_PME_DO_COULOMB) && (flags & GMX_PME_DO_LJ));
+
+    /* We need a maximum of four separate PME calculations:
+     * grid_index=0: Coulomb PME with charges from state A
+     * grid_index=1: Coulomb PME with charges from state B
+     * grid_index=2: LJ PME with C6 from state A
+     * grid_index=3: LJ PME with C6 from state B
+     * For Lorentz-Berthelot combination rules, a separate loop below is
+     * used to calculate all seven terms.
+     */
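+    /* For example: a run with GMX_PME_DO_COULOMB, GMX_PME_DO_LJ and charge
+     * perturbation only (bFEP_q set, bFEP_lj not set) executes grid_index
+     * 0, 1 and 2 and skips grid_index 3.
+     */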
+
+    /* If we are doing LJ-PME with LB, we only do Q here */
+    max_grid_index = (pme->ljpme_combination_rule == eljpmeLB) ? DO_Q : DO_Q_AND_LJ;
+
+    for (grid_index = 0; grid_index < max_grid_index; ++grid_index)
     {
-        if (q == 0)
+        /* Check whether we should do calculations at this grid_index.
+         * If grid_index is odd we are doing FEP,
+         * if grid_index < 2 we are doing electrostatic PME,
+         * if grid_index >= 2 we are doing LJ-PME.
+         */
+        if ((grid_index <  DO_Q && (!(flags & GMX_PME_DO_COULOMB) ||
+                                    (grid_index == 1 && !pme->bFEP_q))) ||
+            (grid_index >= DO_Q && (!(flags & GMX_PME_DO_LJ) ||
+                                    (grid_index == 3 && !pme->bFEP_lj))))
         {
-            pmegrid    = &pme->pmegridA;
-            fftgrid    = pme->fftgridA;
-            cfftgrid   = pme->cfftgridA;
-            pfft_setup = pme->pfft_setupA;
-            charge     = chargeA+start;
+            continue;
         }
-        else
+        /* Unpack structure */
+        pmegrid    = &pme->pmegrid[grid_index];
+        fftgrid    = pme->fftgrid[grid_index];
+        cfftgrid   = pme->cfftgrid[grid_index];
+        pfft_setup = pme->pfft_setup[grid_index];
+        switch (grid_index)
         {
-            pmegrid    = &pme->pmegridB;
-            fftgrid    = pme->fftgridB;
-            cfftgrid   = pme->cfftgridB;
-            pfft_setup = pme->pfft_setupB;
-            charge     = chargeB+start;
+            case 0: coefficient = chargeA + start; break;
+            case 1: coefficient = chargeB + start; break;
+            case 2: coefficient = c6A + start; break;
+            case 3: coefficient = c6B + start; break;
         }
+
         grid = pmegrid->grid.grid;
-        /* Unpack structure */
+
         if (debug)
         {
-            fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+            fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
                     cr->nnodes, cr->nodeid);
             fprintf(debug, "Grid = %p\n", (void*)grid);
             if (grid == NULL)
@@ -4336,58 +4842,14 @@ int gmx_pme_do(gmx_pme_t pme,
         }
         where();
 
-        m_inv_ur0(box, pme->recipbox);
-
         if (pme->nnodes == 1)
         {
-            atc = &pme->atc[0];
-            if (DOMAINDECOMP(cr))
-            {
-                atc->n = homenr;
-                pme_realloc_atomcomm_things(atc);
-            }
-            atc->x = x;
-            atc->q = charge;
-            atc->f = f;
+            atc->coefficient = coefficient;
         }
         else
         {
             wallcycle_start(wcycle, ewcPME_REDISTXF);
-            for (d = pme->ndecompdim-1; d >= 0; d--)
-            {
-                if (d == pme->ndecompdim-1)
-                {
-                    n_d = homenr;
-                    x_d = x + start;
-                    q_d = charge;
-                }
-                else
-                {
-                    n_d = pme->atc[d+1].n;
-                    x_d = atc->x;
-                    q_d = atc->q;
-                }
-                atc      = &pme->atc[d];
-                atc->npd = n_d;
-                if (atc->npd > atc->pd_nalloc)
-                {
-                    atc->pd_nalloc = over_alloc_dd(atc->npd);
-                    srenew(atc->pd, atc->pd_nalloc);
-                }
-                atc->maxshift = (atc->dimind == 0 ? maxshift_x : maxshift_y);
-                pme_calc_pidx_wrapper(n_d, pme->recipbox, x_d, atc);
-                where();
-
-                /* Redistribute x (only once) and qA or qB */
-                if (DOMAINDECOMP(cr))
-                {
-                    dd_pmeredist_x_q(pme, n_d, q == 0, x_d, q_d, atc);
-                }
-                else
-                {
-                    pmeredist_pd(pme, TRUE, n_d, q == 0, x_d, q_d, atc);
-                }
-            }
+            do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, coefficient);
             where();
 
             wallcycle_stop(wcycle, ewcPME_REDISTXF);
@@ -4395,25 +4857,25 @@ int gmx_pme_do(gmx_pme_t pme,
 
         if (debug)
         {
-            fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+            fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
                     cr->nodeid, atc->n);
         }
 
-        if (flags & GMX_PME_SPREAD_Q)
+        if (flags & GMX_PME_SPREAD)
         {
             wallcycle_start(wcycle, ewcPME_SPREADGATHER);
 
-            /* Spread the charges on a grid */
-            spread_on_grid(pme, &pme->atc[0], pmegrid, q == 0, TRUE, fftgrid);
+            /* Spread the coefficients on a grid */
+            spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
 
-            if (q == 0)
+            if (bFirst)
             {
                 inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
             }
-            inc_nrnb(nrnb, eNR_SPREADQBSP,
+            inc_nrnb(nrnb, eNR_SPREADBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
 
-            if (pme->nthread == 1)
+            if (!pme->bUseThreads)
             {
                 wrap_periodic_pmegrid(pme, grid);
 
@@ -4421,12 +4883,12 @@ int gmx_pme_do(gmx_pme_t pme,
 #ifdef GMX_MPI
                 if (pme->nnodes > 1)
                 {
-                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_FORWARD);
+                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
                     where();
                 }
 #endif
 
-                copy_pmegrid_to_fftgrid(pme, grid, fftgrid);
+                copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
             }
 
             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
@@ -4451,7 +4913,7 @@ int gmx_pme_do(gmx_pme_t pme,
                     wallcycle_start(wcycle, ewcPME_FFT);
                 }
                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
-                                           fftgrid, cfftgrid, thread, wcycle);
+                                           thread, wcycle);
                 if (thread == 0)
                 {
                     wallcycle_stop(wcycle, ewcPME_FFT);
@@ -4461,16 +4923,28 @@ int gmx_pme_do(gmx_pme_t pme,
                 /* solve in k-space for our local cells */
                 if (thread == 0)
                 {
-                    wallcycle_start(wcycle, ewcPME_SOLVE);
+                    wallcycle_start(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
+                }
+                if (grid_index < DO_Q)
+                {
+                    loop_count =
+                        solve_pme_yzx(pme, cfftgrid, ewaldcoeff_q,
+                                      box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+                                      bCalcEnerVir,
+                                      pme->nthread, thread);
                 }
-                loop_count =
-                    solve_pme_yzx(pme, cfftgrid, ewaldcoeff,
-                                  box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
-                                  bCalcEnerVir,
-                                  pme->nthread, thread);
+                else
+                {
+                    loop_count =
+                        solve_pme_lj_yzx(pme, &cfftgrid, FALSE, ewaldcoeff_lj,
+                                         box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+                                         bCalcEnerVir,
+                                         pme->nthread, thread);
+                }
+
                 if (thread == 0)
                 {
-                    wallcycle_stop(wcycle, ewcPME_SOLVE);
+                    wallcycle_stop(wcycle, (grid_index < DO_Q ? ewcPME_SOLVE : ewcLJPME));
                     where();
                     inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
                 }
@@ -4485,7 +4959,7 @@ int gmx_pme_do(gmx_pme_t pme,
                     wallcycle_start(wcycle, ewcPME_FFT);
                 }
                 gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
-                                           cfftgrid, fftgrid, thread, wcycle);
+                                           thread, wcycle);
                 if (thread == 0)
                 {
                     wallcycle_stop(wcycle, ewcPME_FFT);
@@ -4499,10 +4973,13 @@ int gmx_pme_do(gmx_pme_t pme,
                         inc_nrnb(nrnb, eNR_FFT, 2*npme);
                     }
 
+                    /* Note: this wallcycle region is closed below
+                       outside an OpenMP region, so take care if
+                       refactoring code here. */
                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
                 }
 
-                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, pme->nthread, thread);
+                copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
             }
         }
         /* End of thread parallel section.
@@ -4515,7 +4992,7 @@ int gmx_pme_do(gmx_pme_t pme,
 #ifdef GMX_MPI
             if (pme->nnodes > 1)
             {
-                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_QGRID_BACKWARD);
+                gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
             }
 #endif
             where();
@@ -4530,19 +5007,22 @@ int gmx_pme_do(gmx_pme_t pme,
              * atc->f is the actual force array, not a buffer,
              * therefore we should not clear it.
              */
-            bClearF = (q == 0 && PAR(cr));
+            lambda  = grid_index < DO_Q ? lambda_q : lambda_lj;
+            bClearF = (bFirst && PAR(cr));
 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
             for (thread = 0; thread < pme->nthread; thread++)
             {
                 gather_f_bsplines(pme, grid, bClearF, atc,
                                   &atc->spline[thread],
-                                  pme->bFEP ? (q == 0 ? 1.0-lambda : lambda) : 1.0);
+                                  pme->bFEP ? (grid_index % 2 == 0 ? 1.0-lambda : lambda) : 1.0);
             }
 
             where();
 
             inc_nrnb(nrnb, eNR_GATHERFBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+            /* Note: this wallcycle region is opened above inside an OpenMP
+               region, so take care if refactoring code here. */
             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
         }
 
@@ -4551,9 +5031,269 @@ int gmx_pme_do(gmx_pme_t pme,
             /* This should only be called on the master thread
              * and after the threads have synchronized.
              */
-            get_pme_ener_vir(pme, pme->nthread, &energy_AB[q], vir_AB[q]);
+            if (grid_index < 2)
+            {
+                get_pme_ener_vir_q(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
+            }
+            else
+            {
+                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[grid_index], vir_AB[grid_index]);
+            }
         }
-    } /* of q-loop */
+        bFirst = FALSE;
+    } /* of grid_index-loop */
+
+    /* For Lorentz-Berthelot combination rules in LJ-PME, we need to calculate
+     * seven terms. */
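+    /* A sketch of where the seven terms come from, assuming the standard
+     * Lorentz-Berthelot rules sigma_ij = (sigma_i + sigma_j)/2 and
+     * eps_ij = sqrt(eps_i*eps_j): the dispersion coefficient
+     * C6_ij ~ eps_ij*sigma_ij^6 expands binomially as
+     *   sigma_ij^6 = 2^-6 * sum_{k=0..6} C(6,k) sigma_i^k sigma_j^(6-k),
+     * and since eps_ij factorizes per atom, this gives seven separable
+     * products, each handled on its own PME grid (grid_index 2 to 8 below).
+     */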
+
+    if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
+    {
+        /* Loop over A- and B-state if we are doing FEP */
+        for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
+        {
+            real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
+            if (pme->nnodes == 1)
+            {
+                if (pme->lb_buf1 == NULL)
+                {
+                    pme->lb_buf_nalloc = pme->atc[0].n;
+                    snew(pme->lb_buf1, pme->lb_buf_nalloc);
+                }
+                pme->atc[0].coefficient = pme->lb_buf1;
+                switch (fep_state)
+                {
+                    case 0:
+                        local_c6      = c6A;
+                        local_sigma   = sigmaA;
+                        break;
+                    case 1:
+                        local_c6      = c6B;
+                        local_sigma   = sigmaB;
+                        break;
+                    default:
+                        gmx_incons("Trying to access an invalid FEP state in the LJ-PME routine");
+                }
+            }
+            else
+            {
+                atc = &pme->atc[0];
+                switch (fep_state)
+                {
+                    case 0:
+                        RedistC6      = c6A;
+                        RedistSigma   = sigmaA;
+                        break;
+                    case 1:
+                        RedistC6      = c6B;
+                        RedistSigma   = sigmaB;
+                        break;
+                    default:
+                        gmx_incons("Trying to access an invalid FEP state in the LJ-PME routine");
+                }
+                wallcycle_start(wcycle, ewcPME_REDISTXF);
+
+                do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
+                if (pme->lb_buf_nalloc < atc->n)
+                {
+                    pme->lb_buf_nalloc = atc->nalloc;
+                    srenew(pme->lb_buf1, pme->lb_buf_nalloc);
+                    srenew(pme->lb_buf2, pme->lb_buf_nalloc);
+                }
+                local_c6 = pme->lb_buf1;
+                for (i = 0; i < atc->n; ++i)
+                {
+                    local_c6[i] = atc->coefficient[i];
+                }
+                where();
+
+                do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
+                local_sigma = pme->lb_buf2;
+                for (i = 0; i < atc->n; ++i)
+                {
+                    local_sigma[i] = atc->coefficient[i];
+                }
+                where();
+
+                wallcycle_stop(wcycle, ewcPME_REDISTXF);
+            }
+            calc_initial_lb_coeffs(pme, local_c6, local_sigma);
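+            /* calc_initial_lb_coeffs() initializes the per-atom coefficients
+             * from c6 and sigma; each calc_next_lb_coeffs() call below then
+             * advances them to the next term of the expansion sketched above.
+             */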
+
+            /* Seven terms in LJ-PME with LB; grid_index < 2 is reserved for electrostatics */
+            for (grid_index = 2; grid_index < 9; ++grid_index)
+            {
+                /* Unpack structure */
+                pmegrid    = &pme->pmegrid[grid_index];
+                fftgrid    = pme->fftgrid[grid_index];
+                cfftgrid   = pme->cfftgrid[grid_index];
+                pfft_setup = pme->pfft_setup[grid_index];
+                calc_next_lb_coeffs(pme, local_sigma);
+                grid = pmegrid->grid.grid;
+                where();
+
+                if (flags & GMX_PME_SPREAD)
+                {
+                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+                    /* Spread the coefficients of the current LB term on a grid */
+                    spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
+
+                    if (bFirst)
+                    {
+                        inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
+                    }
+
+                    inc_nrnb(nrnb, eNR_SPREADBSP,
+                             pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
+                    if (pme->nthread == 1)
+                    {
+                        wrap_periodic_pmegrid(pme, grid);
+                        /* sum contributions to local grid from other nodes */
+#ifdef GMX_MPI
+                        if (pme->nnodes > 1)
+                        {
+                            gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
+                            where();
+                        }
+#endif
+                        copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
+                    }
+                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+                }
+                /* Here we start a large thread parallel region */
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+                {
+                    thread = gmx_omp_get_thread_num();
+                    if (flags & GMX_PME_SOLVE)
+                    {
+                        /* do 3d-fft */
+                        if (thread == 0)
+                        {
+                            wallcycle_start(wcycle, ewcPME_FFT);
+                        }
+
+                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+                                                   thread, wcycle);
+                        if (thread == 0)
+                        {
+                            wallcycle_stop(wcycle, ewcPME_FFT);
+                        }
+                        where();
+                    }
+                }
+                bFirst = FALSE;
+            }
+            if (flags & GMX_PME_SOLVE)
+            {
+                /* solve in k-space for our local cells */
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+                {
+                    int loop_count;
+                    thread = gmx_omp_get_thread_num();
+                    if (thread == 0)
+                    {
+                        wallcycle_start(wcycle, ewcLJPME);
+                    }
+
+                    loop_count =
+                        solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
+                                         box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+                                         bCalcEnerVir,
+                                         pme->nthread, thread);
+                    if (thread == 0)
+                    {
+                        wallcycle_stop(wcycle, ewcLJPME);
+                        where();
+                        inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+                    }
+                }
+            }
+
+            if (bCalcEnerVir)
+            {
+                /* This should only be called on the master thread and
+                 * after the threads have synchronized.
+                 */
+                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
+            }
+
+            if (bCalcF)
+            {
+                bFirst = !(flags & GMX_PME_DO_COULOMB);
+                calc_initial_lb_coeffs(pme, local_c6, local_sigma);
+                for (grid_index = 8; grid_index >= 2; --grid_index)
+                {
+                    /* Unpack structure */
+                    pmegrid    = &pme->pmegrid[grid_index];
+                    fftgrid    = pme->fftgrid[grid_index];
+                    cfftgrid   = pme->cfftgrid[grid_index];
+                    pfft_setup = pme->pfft_setup[grid_index];
+                    grid       = pmegrid->grid.grid;
+                    calc_next_lb_coeffs(pme, local_sigma);
+                    where();
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+                    {
+                        thread = gmx_omp_get_thread_num();
+                        /* do 3d-invfft */
+                        if (thread == 0)
+                        {
+                            where();
+                            wallcycle_start(wcycle, ewcPME_FFT);
+                        }
+
+                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+                                                   thread, wcycle);
+                        if (thread == 0)
+                        {
+                            wallcycle_stop(wcycle, ewcPME_FFT);
+
+                            where();
+
+                            if (pme->nodeid == 0)
+                            {
+                                ntot  = pme->nkx*pme->nky*pme->nkz;
+                                npme  = ntot*log((real)ntot)/log(2.0);
+                                inc_nrnb(nrnb, eNR_FFT, 2*npme);
+                            }
+                            wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+                        }
+
+                        copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
+
+                    } /*#pragma omp parallel*/
+
+                    /* distribute local grid to all nodes */
+#ifdef GMX_MPI
+                    if (pme->nnodes > 1)
+                    {
+                        gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
+                    }
+#endif
+                    where();
+
+                    unwrap_periodic_pmegrid(pme, grid);
+
+                    /* interpolate forces for our local atoms */
+                    where();
+                    bClearF = (bFirst && PAR(cr));
+                    scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
+                    scale  *= lb_scale_factor[grid_index-2];
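+                    /* lb_scale_factor[k] carries the weight of term k of the
+                     * expansion (the binomial factors C(6,k)/2^6 under the
+                     * LB sketch above). */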
+#pragma omp parallel for num_threads(pme->nthread) schedule(static)
+                    for (thread = 0; thread < pme->nthread; thread++)
+                    {
+                        gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
+                                          &pme->atc[0].spline[thread],
+                                          scale);
+                    }
+                    where();
+
+                    inc_nrnb(nrnb, eNR_GATHERFBSP,
+                             pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+
+                    bFirst = FALSE;
+                } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
+            }     /* if (bCalcF) */
+        }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
+    }             /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
 
     if (bCalcF && pme->nnodes > 1)
     {
@@ -4576,10 +5316,6 @@ int gmx_pme_do(gmx_pme_t pme,
                 dd_pmeredist_f(pme, atc, n_d, f_d,
                                d == pme->ndecompdim-1 && pme->bPPnode);
             }
-            else
-            {
-                pmeredist_pd(pme, FALSE, n_d, TRUE, f_d, NULL, atc);
-            }
         }
 
         wallcycle_stop(wcycle, ewcPME_REDISTXF);
@@ -4588,34 +5324,64 @@ int gmx_pme_do(gmx_pme_t pme,
 
     if (bCalcEnerVir)
     {
-        if (!pme->bFEP)
+        if (flags & GMX_PME_DO_COULOMB)
         {
-            *energy = energy_AB[0];
-            m_add(vir, vir_AB[0], vir);
+            if (!pme->bFEP_q)
+            {
+                *energy_q = energy_AB[0];
+                m_add(vir_q, vir_AB[0], vir_q);
+            }
+            else
+            {
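+                /* Linear lambda coupling: E(lambda) = (1-lambda)*E_A + lambda*E_B,
+                 * hence dE/dlambda = E_B - E_A. */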
+                *energy_q       = (1.0-lambda_q)*energy_AB[0] + lambda_q*energy_AB[1];
+                *dvdlambda_q   += energy_AB[1] - energy_AB[0];
+                for (i = 0; i < DIM; i++)
+                {
+                    for (j = 0; j < DIM; j++)
+                    {
+                        vir_q[i][j] += (1.0-lambda_q)*vir_AB[0][i][j] +
+                            lambda_q*vir_AB[1][i][j];
+                    }
+                }
+            }
+            if (debug)
+            {
+                fprintf(debug, "Electrostatic PME mesh energy: %g\n", *energy_q);
+            }
         }
         else
         {
-            *energy     = (1.0-lambda)*energy_AB[0] + lambda*energy_AB[1];
-            *dvdlambda += energy_AB[1] - energy_AB[0];
-            for (i = 0; i < DIM; i++)
+            *energy_q = 0;
+        }
+
+        if (flags & GMX_PME_DO_LJ)
+        {
+            if (!pme->bFEP_lj)
+            {
+                *energy_lj = energy_AB[2];
+                m_add(vir_lj, vir_AB[2], vir_lj);
+            }
+            else
             {
-                for (j = 0; j < DIM; j++)
+                *energy_lj     = (1.0-lambda_lj)*energy_AB[2] + lambda_lj*energy_AB[3];
+                *dvdlambda_lj += energy_AB[3] - energy_AB[2];
+                for (i = 0; i < DIM; i++)
                 {
-                    vir[i][j] += (1.0-lambda)*vir_AB[0][i][j] +
-                        lambda*vir_AB[1][i][j];
+                    for (j = 0; j < DIM; j++)
+                    {
+                        vir_lj[i][j] += (1.0-lambda_lj)*vir_AB[2][i][j] + lambda_lj*vir_AB[3][i][j];
+                    }
                 }
             }
+            if (debug)
+            {
+                fprintf(debug, "Lennard-Jones PME mesh energy: %g\n", *energy_lj);
+            }
+        }
+        else
+        {
+            *energy_lj = 0;
         }
     }
-    else
-    {
-        *energy = 0;
-    }
-
-    if (debug)
-    {
-        fprintf(debug, "PME mesh energy: %g\n", *energy);
-    }
-
     return 0;
 }