Sort all includes in src/gromacs
diff --git a/src/gromacs/mdlib/pme.c b/src/gromacs/mdlib/pme.c
index 3d8809e8b360aa38ad061e30cc67bf0d030b6977..67a12ab58db8d0a8f5b4dc4a2f53f75214da1568 100644
--- a/src/gromacs/mdlib/pme.c
+++ b/src/gromacs/mdlib/pme.c
  * /Erik 001109
  */
 
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "gromacs/legacyheaders/pme.h"
+
+#include "config.h"
 
+#include <assert.h>
+#include <math.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <assert.h>
-#include "typedefs.h"
-#include "txtdump.h"
-#include "vec.h"
-#include "smalloc.h"
-#include "coulomb.h"
-#include "gmx_fatal.h"
-#include "pme.h"
-#include "network.h"
-#include "physics.h"
-#include "nrnb.h"
-#include "macros.h"
 
 #include "gromacs/fft/parallel_3dfft.h"
-#include "gromacs/fileio/futil.h"
 #include "gromacs/fileio/pdbio.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
 #include "gromacs/math/gmxcomplex.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/vec.h"
 #include "gromacs/timing/cyclecounter.h"
 #include "gromacs/timing/wallcycle.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/futil.h"
 #include "gromacs/utility/gmxmpi.h"
 #include "gromacs/utility/gmxomp.h"
+#include "gromacs/utility/smalloc.h"
 
 /* Include the SIMD macro file and then check for support */
 #include "gromacs/simd/simd.h"
@@ -304,13 +307,10 @@ typedef struct gmx_pme {
     int        nthread;       /* The number of threads doing PME on our rank */
 
     gmx_bool   bPPnode;       /* Node also does particle-particle forces */
-
     gmx_bool   bFEP;          /* Compute Free energy contribution */
     gmx_bool   bFEP_q;
     gmx_bool   bFEP_lj;
-
     int        nkx, nky, nkz; /* Grid dimensions */
-
     gmx_bool   bP3M;          /* Do P3M: optimize the influence function */
     int        pme_order;
     real       epsilon_r;
@@ -754,7 +754,7 @@ static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
     {
         if (atc->count[atc->nodeid] + nsend != n)
         {
-            gmx_fatal(FARGS, "%d particles communicated to PME node %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
+            gmx_fatal(FARGS, "%d particles communicated to PME rank %d are more than 2/3 times the cut-off out of the domain decomposition cell of their charge group in dimension %c.\n"
                       "This usually means that your system is not well equilibrated.",
                       n - (atc->count[atc->nodeid] + nsend),
                       pme->nodeid, 'x'+atc->dimind);
@@ -774,7 +774,7 @@ static void dd_pmeredist_pos_coeffs(gmx_pme_t pme,
             /* Communicate the count */
             if (debug)
             {
-                fprintf(debug, "dimind %d PME node %d send to node %d: %d\n",
+                fprintf(debug, "dimind %d PME rank %d send to rank %d: %d\n",
                         atc->dimind, atc->nodeid, commnode[i], scount);
             }
             pme_dd_sendrecv(atc, FALSE, i,
@@ -949,7 +949,7 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         /* Copy data to contiguous send buffer */
         if (debug)
         {
-            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, send_id,
                     pme->pmegrid_start_iy,
                     send_index0-pme->pmegrid_start_iy,
@@ -981,7 +981,7 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
         /* Get data from contiguous recv buffer */
         if (debug)
         {
-            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, recv_id,
                     pme->pmegrid_start_iy,
                     recv_index0-pme->pmegrid_start_iy,
@@ -1045,12 +1045,12 @@ static void gmx_sum_qgrid_dd(gmx_pme_t pme, real *grid, int direction)
 
         if (debug)
         {
-            fprintf(debug, "PME send node %d %d -> %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME send rank %d %d -> %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, send_id,
                     pme->pmegrid_start_ix,
                     send_index0-pme->pmegrid_start_ix,
                     send_index0-pme->pmegrid_start_ix+send_nindex);
-            fprintf(debug, "PME recv node %d %d <- %d grid start %d Communicating %d to %d\n",
+            fprintf(debug, "PME recv rank %d %d <- %d grid start %d Communicating %d to %d\n",
                     pme->nodeid, overlap->nodeid, recv_id,
                     pme->pmegrid_start_ix,
                     recv_index0-pme->pmegrid_start_ix,
@@ -1100,13 +1100,12 @@ static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid,
     {
 #ifdef DEBUG_PME
         FILE *fp, *fp2;
-        char  fn[STRLEN], format[STRLEN];
+        char  fn[STRLEN];
         real  val;
         sprintf(fn, "pmegrid%d.pdb", pme->nodeid);
         fp = gmx_ffopen(fn, "w");
         sprintf(fn, "pmegrid%d.txt", pme->nodeid);
         fp2 = gmx_ffopen(fn, "w");
-        sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
 #endif
 
         for (ix = 0; ix < local_fft_ndata[XX]; ix++)
@@ -1122,8 +1121,8 @@ static int copy_pmegrid_to_fftgrid(gmx_pme_t pme, real *pmegrid, real *fftgrid,
                     val = 100*pmegrid[pmeidx];
                     if (pmegrid[pmeidx] != 0)
                     {
-                        fprintf(fp, format, "ATOM", pmeidx, "CA", "GLY", ' ', pmeidx, ' ',
-                                5.0*ix, 5.0*iy, 5.0*iz, 1.0, val);
+                        gmx_fprintf_pdb_atomline(fp, epdbATOM, pmeidx, "CA", ' ', "GLY", ' ', pmeidx, ' ',
+                                                 5.0*ix, 5.0*iy, 5.0*iz, 1.0, val, "");
                     }
                     if (pmegrid[pmeidx] != 0)
                     {
@@ -1437,7 +1436,7 @@ static void spread_coefficients_bsplines_thread(pmegrid_t                    *pm
 #define PME_SPREAD_SIMD4_ALIGNED
 #define PME_ORDER 4
 #endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_BSPLINE(4);
 #endif
@@ -1446,7 +1445,7 @@ static void spread_coefficients_bsplines_thread(pmegrid_t                    *pm
 #ifdef PME_SIMD4_SPREAD_GATHER
 #define PME_SPREAD_SIMD4_ALIGNED
 #define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_BSPLINE(5);
 #endif
@@ -2600,7 +2599,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 #define PME_GATHER_F_SIMD4_ALIGNED
 #define PME_ORDER 4
 #endif
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_FSPLINE(4);
 #endif
@@ -2609,7 +2608,7 @@ static void gather_f_bsplines(gmx_pme_t pme, real *grid,
 #ifdef PME_SIMD4_SPREAD_GATHER
 #define PME_GATHER_F_SIMD4_ALIGNED
 #define PME_ORDER 5
-#include "pme_simd4.h"
+#include "gromacs/mdlib/pme_simd4.h"
 #else
                     DO_FSPLINE(5);
 #endif
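
These four hunks only make the pme_simd4.h include path explicit, but they highlight the pattern being relied on: the same header is included repeatedly, each time after defining PME_ORDER (and an alignment macro), so one template file expands into order-4 and order-5 SIMD kernels. A generic sketch of that multiple-inclusion technique, using a hypothetical sum_kernel.h that is not part of GROMACS:

/* sum_kernel.h (hypothetical template header; deliberately no include guard).
 * It expands to a loop body specialized for KERNEL_ORDER and then undefines
 * the macro so it can be included again with a different value:
 *
 *     #if KERNEL_ORDER == 4
 *         for (k = 0; k < 4; k++) { acc[k] += w[k]; }
 *     #elif KERNEL_ORDER == 5
 *         for (k = 0; k < 5; k++) { acc[k] += w[k]; }
 *     #endif
 *     #undef KERNEL_ORDER
 *
 * In the including .c file, each expansion emits one specialized body: */
#define KERNEL_ORDER 4
#include "sum_kernel.h"     /* order-4 body generated here */

#define KERNEL_ORDER 5
#include "sum_kernel.h"     /* order-5 body generated here */
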
@@ -3134,8 +3133,9 @@ init_overlap_comm(pme_overlap_t *  ol,
     {
         /* s2g0 the local interpolation grid start.
          * s2g1 the local interpolation grid end.
-         * Because grid overlap communication only goes forward,
-         * the grid the slabs for fft's should be rounded down.
+         * Since in calc_pidx we divide particles, and not grid lines,
+         * spatially uniform along dimension x or y, we need to round
+         * s2g0 down and s2g1 up.
          */
         ol->s2g0[i] = ( i   *ndata + 0       )/nnodes;
         ol->s2g1[i] = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;
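
The replacement comment gives the reason for the asymmetric rounding: calc_pidx distributes particles, not grid lines, uniformly over ranks, so a rank's interpolation region must start at the floor and end at the ceiling of the ideal slab boundary, plus order-1 lines of overlap. A small stand-alone illustration of the same integer arithmetic, with made-up numbers (27 grid lines, 4 ranks, pme_order 4):

#include <stdio.h>

int main(void)
{
    const int ndata = 27, nnodes = 4, norder = 4;   /* made-up sizes */
    int       i, s2g0, s2g1;

    for (i = 0; i < nnodes; i++)
    {
        s2g0 = ( i   *ndata + 0       )/nnodes;                      /* rounded down */
        s2g1 = ((i+1)*ndata + nnodes-1)/nnodes + norder - 1;         /* rounded up + order-1 overlap */
        printf("rank %d: interpolation grid lines [%2d, %2d)\n", i, s2g0, s2g1);
    }
    /* prints [0,10) [6,17) [13,24) [20,30): starts are floors, ends are ceilings + 3 */
    return 0;
}
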
@@ -3357,7 +3357,7 @@ void gmx_pme_check_restrictions(int pme_order,
             *bValidSettings = FALSE;
             return;
         }
-        gmx_fatal(FARGS, "The number of PME grid lines per node along x is %g. But when using OpenMP threads, the number of grid lines per node along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use less nodes along x (and possibly more along y and/or z) by specifying -dd manually.",
+        gmx_fatal(FARGS, "The number of PME grid lines per rank along x is %g. But when using OpenMP threads, the number of grid lines per rank along x should be >= pme_order (%d) or = pmeorder-1. To resolve this issue, use fewer ranks along x (and possibly more along y and/or z) by specifying -dd manually.",
                   nkx/(double)nnodes_major, pme_order);
     }
 
@@ -3410,7 +3410,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         MPI_Comm_size(pme->mpi_comm, &pme->nnodes);
         if (pme->nnodes != nnodes_major*nnodes_minor)
         {
-            gmx_incons("PME node count mismatch");
+            gmx_incons("PME rank count mismatch");
         }
     }
     else
@@ -3459,7 +3459,7 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
         {
             if (pme->nnodes % nnodes_major != 0)
             {
-                gmx_incons("For 2D PME decomposition, #PME nodes must be divisible by the number of nodes in the major dimension");
+                gmx_incons("For 2D PME decomposition, #PME ranks must be divisible by the number of ranks in the major dimension");
             }
             pme->ndecompdim = 2;
 
@@ -3546,8 +3546,8 @@ int gmx_pme_init(gmx_pme_t *         pmedata,
                     "\n"
                     "NOTE: The load imbalance in PME FFT and solve is %d%%.\n"
                     "      For optimal PME load balancing\n"
-                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_nodes_x (%d)\n"
-                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_nodes_y (%d)\n"
+                    "      PME grid_x (%d) and grid_y (%d) should be divisible by #PME_ranks_x (%d)\n"
+                    "      and PME grid_y (%d) and grid_z (%d) should be divisible by #PME_ranks_y (%d)\n"
                     "\n",
                     (int)((imbal-1)*100 + 0.5),
                     pme->nkx, pme->nky, pme->nnodes_major,
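
The reworded note states the load-balancing rule in terms of ranks: the x-decomposition slices grid_x and grid_y, the y-decomposition slices grid_y and grid_z, so all four pairs should divide evenly. A tiny helper expressing just that rule (hypothetical, not the check mdrun itself performs):

/* Hypothetical helper mirroring the divisibility rule in the note above. */
static int pme_grids_divide_evenly(int nkx, int nky, int nkz,
                                   int nranks_x, int nranks_y)
{
    return (nkx % nranks_x == 0 && nky % nranks_x == 0 &&   /* sliced along x */
            nky % nranks_y == 0 && nkz % nranks_y == 0);    /* sliced along y */
}
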
@@ -3837,7 +3837,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
     int  fft_my, fft_mz;
     int  buf_my = -1;
     int  nsx, nsy, nsz;
-    ivec ne;
+    ivec localcopy_end;
     int  offx, offy, offz, x, y, z, i0, i0t;
     int  sx, sy, sz, fx, fy, fz, tx1, ty1, tz1, ox, oy, oz;
     gmx_bool bClearBufX, bClearBufY, bClearBufXY, bClearBuf;
@@ -3873,8 +3873,14 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
 
     for (d = 0; d < DIM; d++)
     {
-        ne[d] = min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
-                    local_fft_ndata[d]);
+        /* Determine up to where our thread needs to copy from the
+         * thread-local charge spreading grid to the rank-local FFT grid.
+         * This is up to our spreading grid end minus order-1 and
+         * not beyond the local FFT grid.
+         */
+        localcopy_end[d] =
+            min(pmegrid->offset[d]+pmegrid->n[d]-(pmegrid->order-1),
+                local_fft_ndata[d]);
     }
 
     offx = pmegrid->offset[XX];
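
The rename from ne to localcopy_end plus the new comment pin down what the clamp does: each thread copies from its own spreading grid into the rank-local FFT grid only up to its grid end minus the order-1 wrap points, and never past the local FFT extent. The same clamp, stand-alone, with made-up sizes:

#include <stdio.h>

int main(void)
{
    /* Made-up sizes for one dimension: a thread grid of 16 points starting
     * at offset 8, pme_order 4, and a local FFT grid of 20 points. */
    int offset = 8, n = 16, order = 4, local_fft_ndata = 20;
    int localcopy_end = offset + n - (order - 1);   /* 8 + 16 - 3 = 21 */

    if (localcopy_end > local_fft_ndata)
    {
        localcopy_end = local_fft_ndata;            /* clamped to 20 */
    }
    printf("copy grid points [%d, %d)\n", offset, localcopy_end);
    return 0;
}
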
@@ -3889,7 +3895,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
     /* Now loop over all the thread data blocks that contribute
      * to the grid region we (our thread) are operating on.
      */
-    /* Note that ffy_nx/y is equal to the number of grid points
+    /* Note that fft_nx/y is equal to the number of grid points
      * between the first point of our node grid and the one of the next node.
      */
     for (sx = 0; sx >= -pmegrids->nthread_comm[XX]; sx--)
@@ -3905,12 +3911,15 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
         }
         pmegrid_g = &pmegrids->grid_th[fx*pmegrids->nc[YY]*pmegrids->nc[ZZ]];
         ox       += pmegrid_g->offset[XX];
+        /* Determine the end of our part of the source grid */
         if (!bCommX)
         {
-            tx1 = min(ox + pmegrid_g->n[XX], ne[XX]);
+            /* Use our thread local source grid and target grid part */
+            tx1 = min(ox + pmegrid_g->n[XX], localcopy_end[XX]);
         }
         else
         {
+            /* Use our thread local source grid and the spreading range */
             tx1 = min(ox + pmegrid_g->n[XX], pme->pme_order);
         }
 
@@ -3927,12 +3936,15 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
             }
             pmegrid_g = &pmegrids->grid_th[fy*pmegrids->nc[ZZ]];
             oy       += pmegrid_g->offset[YY];
+            /* Determine the end of our part of the source grid */
             if (!bCommY)
             {
-                ty1 = min(oy + pmegrid_g->n[YY], ne[YY]);
+                /* Use our thread local source grid and target grid part */
+                ty1 = min(oy + pmegrid_g->n[YY], localcopy_end[YY]);
             }
             else
             {
+                /* Use our thread local source grid and the spreading range */
                 ty1 = min(oy + pmegrid_g->n[YY], pme->pme_order);
             }
 
@@ -3947,7 +3959,7 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                 }
                 pmegrid_g = &pmegrids->grid_th[fz];
                 oz       += pmegrid_g->offset[ZZ];
-                tz1       = min(oz + pmegrid_g->n[ZZ], ne[ZZ]);
+                tz1       = min(oz + pmegrid_g->n[ZZ], localcopy_end[ZZ]);
 
                 if (sx == 0 && sy == 0 && sz == 0)
                 {
@@ -4003,7 +4015,13 @@ reduce_threadgrid_overlap(gmx_pme_t pme,
                     if (bCommY)
                     {
                         commbuf = commbuf_y;
-                        buf_my  = ty1 - offy;
+                        /* The y-size of the communication buffer is set by
+                         * the overlap of the grid part of our local slab
+                         * with the part starting at the next slab.
+                         */
+                        buf_my  =
+                            pme->overlap[1].s2g1[pme->nodeid_minor] -
+                            pme->overlap[1].s2g0[pme->nodeid_minor+1];
                         if (bCommX)
                         {
                             /* We index commbuf modulo the local grid size */
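
The substantive change in this hunk: the y communication buffer is now sized from the overlap of the rank's slab with the next slab in the minor dimension, s2g1[nodeid_minor] - s2g0[nodeid_minor+1], instead of from whatever range the current thread happened to copy (ty1 - offy). A stand-alone illustration reusing the hypothetical slab layout from the earlier sketch (27 grid lines over 4 ranks, order 4):

#include <stdio.h>

int main(void)
{
    /* Slab boundaries as computed in the earlier sketch */
    int s2g0[4] = {  0,  6, 13, 20 };
    int s2g1[4] = { 10, 17, 24, 30 };
    int nodeid_minor = 0;

    /* The buffer must hold the full overlap of our slab with the next one,
     * independent of any single thread's copy range. */
    int buf_my = s2g1[nodeid_minor] - s2g0[nodeid_minor + 1];
    printf("buf_my = %d grid lines\n", buf_my);   /* 10 - 6 = 4 */
    return 0;
}
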
@@ -4116,6 +4134,12 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
             sendptr = overlap->sendbuf + send_index0*local_fft_ndata[ZZ];
             recvptr = overlap->recvbuf;
 
+            if (debug != NULL)
+            {
+                fprintf(debug, "PME fftgrid comm y %2d x %2d x %2d\n",
+                        local_fft_ndata[XX], send_nindex, local_fft_ndata[ZZ]);
+            }
+
 #ifdef GMX_MPI
             MPI_Sendrecv(sendptr, send_size_y*datasize, GMX_MPI_REAL,
                          send_id, ipulse,
@@ -4182,7 +4206,7 @@ static void sum_fftgrid_dd(gmx_pme_t pme, real *fftgrid, int grid_index)
 
         if (debug != NULL)
         {
-            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
+            fprintf(debug, "PME fftgrid comm %2d x %2d x %2d\n",
                     send_nindex, local_fft_ndata[YY], local_fft_ndata[ZZ]);
         }
 
@@ -4683,14 +4707,12 @@ do_redist_pos_coeffs(gmx_pme_t pme, t_commrec *cr, int start, int homenr,
     }
 }
 
-/* TODO: when adding free-energy support, sigmaB will no longer be
- * unused */
 int gmx_pme_do(gmx_pme_t pme,
                int start,       int homenr,
                rvec x[],        rvec f[],
                real *chargeA,   real *chargeB,
                real *c6A,       real *c6B,
-               real *sigmaA,    real gmx_unused *sigmaB,
+               real *sigmaA,    real *sigmaB,
                matrix box, t_commrec *cr,
                int  maxshift_x, int maxshift_y,
                t_nrnb *nrnb,    gmx_wallcycle_t wcycle,
@@ -4701,7 +4723,7 @@ int gmx_pme_do(gmx_pme_t pme,
                real *dvdlambda_q, real *dvdlambda_lj,
                int flags)
 {
-    int     d, i, j, ntot, npme, grid_index, max_grid_index;
+    int     d, i, j, k, ntot, npme, grid_index, max_grid_index;
     int     nx, ny, nz;
     int     n_d, local_ny;
     pme_atomcomm_t *atc = NULL;
@@ -4719,6 +4741,8 @@ int gmx_pme_do(gmx_pme_t pme,
     t_complex * cfftgrid;
     int     thread;
     gmx_bool bFirst, bDoSplines;
+    int fep_state;
+    int fep_states_lj           = pme->bFEP_lj ? 2 : 1;
     const gmx_bool bCalcEnerVir = flags & GMX_PME_CALC_ENER_VIR;
     const gmx_bool bCalcF       = flags & GMX_PME_CALC_F;
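
fep_states_lj makes the LJ-PME (LB) block below run once per free-energy state: twice when bFEP_lj is set (A-state c6A/sigmaA, then B-state c6B/sigmaB), otherwise once. The per-state force contributions are blended with 1-lambda_lj and lambda_lj, as in the scale factor further down; a minimal stand-alone sketch of that linear mixing with made-up numbers (the actual combination of energy_AB is done by the caller):

#include <stdio.h>

int main(void)
{
    /* Made-up per-state LJ-PME energies and coupling parameter */
    double energy_A = -125.0, energy_B = -118.0, lambda_lj = 0.25;

    /* A-state contributions weighted by 1-lambda, B-state by lambda, mirroring
     * scale = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0   */
    double energy = (1.0 - lambda_lj)*energy_A + lambda_lj*energy_B;
    double dvdl   = energy_B - energy_A;   /* dE/dlambda for this linear mixing */

    printf("E(lambda=%.2f) = %.3f  dE/dlambda = %.3f\n", lambda_lj, energy, dvdl);
    return 0;
}
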
 
@@ -4808,7 +4832,7 @@ int gmx_pme_do(gmx_pme_t pme,
 
         if (debug)
         {
-            fprintf(debug, "PME: nnodes = %d, nodeid = %d\n",
+            fprintf(debug, "PME: number of ranks = %d, rank = %d\n",
                     cr->nnodes, cr->nodeid);
             fprintf(debug, "Grid = %p\n", (void*)grid);
             if (grid == NULL)
@@ -4833,7 +4857,7 @@ int gmx_pme_do(gmx_pme_t pme,
 
         if (debug)
         {
-            fprintf(debug, "Node= %6d, pme local particles=%6d\n",
+            fprintf(debug, "Rank= %6d, pme local particles=%6d\n",
                     cr->nodeid, atc->n);
         }
 
@@ -4949,6 +4973,9 @@ int gmx_pme_do(gmx_pme_t pme,
                         inc_nrnb(nrnb, eNR_FFT, 2*npme);
                     }
 
+                    /* Note: this wallcycle region is closed below
+                       outside an OpenMP region, so take care if
+                       refactoring code here. */
                     wallcycle_start(wcycle, ewcPME_SPREADGATHER);
                 }
 
@@ -4994,6 +5021,8 @@ int gmx_pme_do(gmx_pme_t pme,
 
             inc_nrnb(nrnb, eNR_GATHERFBSP,
                      pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+            /* Note: this wallcycle region is opened above inside an OpenMP
+               region, so take care if refactoring code here. */
             wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
         }
 
@@ -5019,227 +5048,252 @@ int gmx_pme_do(gmx_pme_t pme,
 
     if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB)
     {
-        real *local_c6 = NULL, *local_sigma = NULL;
-        if (pme->nnodes == 1)
+        /* Loop over A- and B-state if we are doing FEP */
+        for (fep_state = 0; fep_state < fep_states_lj; ++fep_state)
         {
-            if (pme->lb_buf1 == NULL)
+            real *local_c6 = NULL, *local_sigma = NULL, *RedistC6 = NULL, *RedistSigma = NULL;
+            if (pme->nnodes == 1)
             {
-                pme->lb_buf_nalloc = pme->atc[0].n;
-                snew(pme->lb_buf1, pme->lb_buf_nalloc);
-            }
-            pme->atc[0].coefficient = pme->lb_buf1;
-            local_c6                = c6A;
-            local_sigma             = sigmaA;
-        }
-        else
-        {
-            atc = &pme->atc[0];
-
-            wallcycle_start(wcycle, ewcPME_REDISTXF);
-
-            do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, c6A);
-            if (pme->lb_buf_nalloc < atc->n)
-            {
-                pme->lb_buf_nalloc = atc->nalloc;
-                srenew(pme->lb_buf1, pme->lb_buf_nalloc);
-                srenew(pme->lb_buf2, pme->lb_buf_nalloc);
-            }
-            local_c6 = pme->lb_buf1;
-            for (i = 0; i < atc->n; ++i)
-            {
-                local_c6[i] = atc->coefficient[i];
-            }
-            where();
-
-            do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, sigmaA);
-            local_sigma = pme->lb_buf2;
-            for (i = 0; i < atc->n; ++i)
-            {
-                local_sigma[i] = atc->coefficient[i];
+                if (pme->lb_buf1 == NULL)
+                {
+                    pme->lb_buf_nalloc = pme->atc[0].n;
+                    snew(pme->lb_buf1, pme->lb_buf_nalloc);
+                }
+                pme->atc[0].coefficient = pme->lb_buf1;
+                switch (fep_state)
+                {
+                    case 0:
+                        local_c6      = c6A;
+                        local_sigma   = sigmaA;
+                        break;
+                    case 1:
+                        local_c6      = c6B;
+                        local_sigma   = sigmaB;
+                        break;
+                    default:
+                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+                }
             }
-            where();
-
-            wallcycle_stop(wcycle, ewcPME_REDISTXF);
-        }
-        calc_initial_lb_coeffs(pme, local_c6, local_sigma);
-
-        /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
-        for (grid_index = 2; grid_index < 9; ++grid_index)
-        {
-            /* Unpack structure */
-            pmegrid    = &pme->pmegrid[grid_index];
-            fftgrid    = pme->fftgrid[grid_index];
-            cfftgrid   = pme->cfftgrid[grid_index];
-            pfft_setup = pme->pfft_setup[grid_index];
-            calc_next_lb_coeffs(pme, local_sigma);
-            grid = pmegrid->grid.grid;
-            where();
-
-            if (flags & GMX_PME_SPREAD)
+            else
             {
-                wallcycle_start(wcycle, ewcPME_SPREADGATHER);
-                /* Spread the c6 on a grid */
-                spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
+                atc = &pme->atc[0];
+                switch (fep_state)
+                {
+                    case 0:
+                        RedistC6      = c6A;
+                        RedistSigma   = sigmaA;
+                        break;
+                    case 1:
+                        RedistC6      = c6B;
+                        RedistSigma   = sigmaB;
+                        break;
+                    default:
+                        gmx_incons("Trying to access wrong FEP-state in LJ-PME routine");
+                }
+                wallcycle_start(wcycle, ewcPME_REDISTXF);
 
-                if (bFirst)
+                do_redist_pos_coeffs(pme, cr, start, homenr, bFirst, x, RedistC6);
+                if (pme->lb_buf_nalloc < atc->n)
                 {
-                    inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
+                    pme->lb_buf_nalloc = atc->nalloc;
+                    srenew(pme->lb_buf1, pme->lb_buf_nalloc);
+                    srenew(pme->lb_buf2, pme->lb_buf_nalloc);
                 }
-                inc_nrnb(nrnb, eNR_SPREADBSP,
-                         pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
-                if (pme->nthread == 1)
+                local_c6 = pme->lb_buf1;
+                for (i = 0; i < atc->n; ++i)
                 {
-                    wrap_periodic_pmegrid(pme, grid);
+                    local_c6[i] = atc->coefficient[i];
+                }
+                where();
 
-                    /* sum contributions to local grid from other nodes */
-#ifdef GMX_MPI
-                    if (pme->nnodes > 1)
-                    {
-                        gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
-                        where();
-                    }
-#endif
-                    copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
+                do_redist_pos_coeffs(pme, cr, start, homenr, FALSE, x, RedistSigma);
+                local_sigma = pme->lb_buf2;
+                for (i = 0; i < atc->n; ++i)
+                {
+                    local_sigma[i] = atc->coefficient[i];
                 }
+                where();
 
-                wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+                wallcycle_stop(wcycle, ewcPME_REDISTXF);
             }
+            calc_initial_lb_coeffs(pme, local_c6, local_sigma);
 
-            /*Here we start a large thread parallel region*/
-#pragma omp parallel num_threads(pme->nthread) private(thread)
+            /*Seven terms in LJ-PME with LB, grid_index < 2 reserved for electrostatics*/
+            for (grid_index = 2; grid_index < 9; ++grid_index)
             {
-                thread = gmx_omp_get_thread_num();
-                if (flags & GMX_PME_SOLVE)
+                /* Unpack structure */
+                pmegrid    = &pme->pmegrid[grid_index];
+                fftgrid    = pme->fftgrid[grid_index];
+                cfftgrid   = pme->cfftgrid[grid_index];
+                pfft_setup = pme->pfft_setup[grid_index];
+                calc_next_lb_coeffs(pme, local_sigma);
+                grid = pmegrid->grid.grid;
+                where();
+
+                if (flags & GMX_PME_SPREAD)
                 {
-                    /* do 3d-fft */
-                    if (thread == 0)
+                    wallcycle_start(wcycle, ewcPME_SPREADGATHER);
+                    /* Spread the c6 on a grid */
+                    spread_on_grid(pme, &pme->atc[0], pmegrid, bFirst, TRUE, fftgrid, bDoSplines, grid_index);
+
+                    if (bFirst)
                     {
-                        wallcycle_start(wcycle, ewcPME_FFT);
+                        inc_nrnb(nrnb, eNR_WEIGHTS, DIM*atc->n);
                     }
 
-                    gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
-                                               thread, wcycle);
-                    if (thread == 0)
+                    inc_nrnb(nrnb, eNR_SPREADBSP,
+                             pme->pme_order*pme->pme_order*pme->pme_order*atc->n);
+                    if (pme->nthread == 1)
                     {
-                        wallcycle_stop(wcycle, ewcPME_FFT);
+                        wrap_periodic_pmegrid(pme, grid);
+                        /* sum contributions to local grid from other nodes */
+#ifdef GMX_MPI
+                        if (pme->nnodes > 1)
+                        {
+                            gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_FORWARD);
+                            where();
+                        }
+#endif
+                        copy_pmegrid_to_fftgrid(pme, grid, fftgrid, grid_index);
                     }
-                    where();
+                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
                 }
-            }
-            bFirst = FALSE;
-        }
-        if (flags & GMX_PME_SOLVE)
-        {
-            int loop_count;
-            /* solve in k-space for our local cells */
+                /*Here we start a large thread parallel region*/
 #pragma omp parallel num_threads(pme->nthread) private(thread)
-            {
-                thread = gmx_omp_get_thread_num();
-                if (thread == 0)
                 {
-                    wallcycle_start(wcycle, ewcLJPME);
-                }
+                    thread = gmx_omp_get_thread_num();
+                    if (flags & GMX_PME_SOLVE)
+                    {
+                        /* do 3d-fft */
+                        if (thread == 0)
+                        {
+                            wallcycle_start(wcycle, ewcPME_FFT);
+                        }
 
-                loop_count =
-                    solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
-                                     box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
-                                     bCalcEnerVir,
-                                     pme->nthread, thread);
-                if (thread == 0)
-                {
-                    wallcycle_stop(wcycle, ewcLJPME);
-                    where();
-                    inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_REAL_TO_COMPLEX,
+                                                   thread, wcycle);
+                        if (thread == 0)
+                        {
+                            wallcycle_stop(wcycle, ewcPME_FFT);
+                        }
+                        where();
+                    }
                 }
+                bFirst = FALSE;
             }
-        }
-
-        if (bCalcEnerVir)
-        {
-            /* This should only be called on the master thread and
-             * after the threads have synchronized.
-             */
-            get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2], vir_AB[2]);
-        }
-
-        if (bCalcF)
-        {
-            bFirst = !(flags & GMX_PME_DO_COULOMB);
-            calc_initial_lb_coeffs(pme, local_c6, local_sigma);
-            for (grid_index = 8; grid_index >= 2; --grid_index)
+            if (flags & GMX_PME_SOLVE)
             {
-                /* Unpack structure */
-                pmegrid    = &pme->pmegrid[grid_index];
-                fftgrid    = pme->fftgrid[grid_index];
-                cfftgrid   = pme->cfftgrid[grid_index];
-                pfft_setup = pme->pfft_setup[grid_index];
-                grid       = pmegrid->grid.grid;
-                calc_next_lb_coeffs(pme, local_sigma);
-                where();
+                /* solve in k-space for our local cells */
 #pragma omp parallel num_threads(pme->nthread) private(thread)
                 {
+                    int loop_count;
                     thread = gmx_omp_get_thread_num();
-                    /* do 3d-invfft */
                     if (thread == 0)
                     {
-                        where();
-                        wallcycle_start(wcycle, ewcPME_FFT);
+                        wallcycle_start(wcycle, ewcLJPME);
                     }
 
-                    gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
-                                               thread, wcycle);
+                    loop_count =
+                        solve_pme_lj_yzx(pme, &pme->cfftgrid[2], TRUE, ewaldcoeff_lj,
+                                         box[XX][XX]*box[YY][YY]*box[ZZ][ZZ],
+                                         bCalcEnerVir,
+                                         pme->nthread, thread);
                     if (thread == 0)
                     {
-                        wallcycle_stop(wcycle, ewcPME_FFT);
-
+                        wallcycle_stop(wcycle, ewcLJPME);
                         where();
+                        inc_nrnb(nrnb, eNR_SOLVEPME, loop_count);
+                    }
+                }
+            }
+
+            if (bCalcEnerVir)
+            {
+                /* This should only be called on the master thread and
+                 * after the threads have synchronized.
+                 */
+                get_pme_ener_vir_lj(pme, pme->nthread, &energy_AB[2+fep_state], vir_AB[2+fep_state]);
+            }
 
-                        if (pme->nodeid == 0)
+            if (bCalcF)
+            {
+                bFirst = !(flags & GMX_PME_DO_COULOMB);
+                calc_initial_lb_coeffs(pme, local_c6, local_sigma);
+                for (grid_index = 8; grid_index >= 2; --grid_index)
+                {
+                    /* Unpack structure */
+                    pmegrid    = &pme->pmegrid[grid_index];
+                    fftgrid    = pme->fftgrid[grid_index];
+                    cfftgrid   = pme->cfftgrid[grid_index];
+                    pfft_setup = pme->pfft_setup[grid_index];
+                    grid       = pmegrid->grid.grid;
+                    calc_next_lb_coeffs(pme, local_sigma);
+                    where();
+#pragma omp parallel num_threads(pme->nthread) private(thread)
+                    {
+                        thread = gmx_omp_get_thread_num();
+                        /* do 3d-invfft */
+                        if (thread == 0)
                         {
-                            ntot  = pme->nkx*pme->nky*pme->nkz;
-                            npme  = ntot*log((real)ntot)/log(2.0);
-                            inc_nrnb(nrnb, eNR_FFT, 2*npme);
+                            where();
+                            wallcycle_start(wcycle, ewcPME_FFT);
+                        }
+
+                        gmx_parallel_3dfft_execute(pfft_setup, GMX_FFT_COMPLEX_TO_REAL,
+                                                   thread, wcycle);
+                        if (thread == 0)
+                        {
+                            wallcycle_stop(wcycle, ewcPME_FFT);
+
+                            where();
+
+                            if (pme->nodeid == 0)
+                            {
+                                ntot  = pme->nkx*pme->nky*pme->nkz;
+                                npme  = ntot*log((real)ntot)/log(2.0);
+                                inc_nrnb(nrnb, eNR_FFT, 2*npme);
+                            }
+                            wallcycle_start(wcycle, ewcPME_SPREADGATHER);
                         }
-                        wallcycle_start(wcycle, ewcPME_SPREADGATHER);
-                    }
 
-                    copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
+                        copy_fftgrid_to_pmegrid(pme, fftgrid, grid, grid_index, pme->nthread, thread);
 
-                } /*#pragma omp parallel*/
+                    } /*#pragma omp parallel*/
 
-                /* distribute local grid to all nodes */
+                    /* distribute local grid to all nodes */
 #ifdef GMX_MPI
-                if (pme->nnodes > 1)
-                {
-                    gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
-                }
+                    if (pme->nnodes > 1)
+                    {
+                        gmx_sum_qgrid_dd(pme, grid, GMX_SUM_GRID_BACKWARD);
+                    }
 #endif
-                where();
+                    where();
 
-                unwrap_periodic_pmegrid(pme, grid);
+                    unwrap_periodic_pmegrid(pme, grid);
 
-                /* interpolate forces for our local atoms */
-                where();
-                bClearF = (bFirst && PAR(cr));
-                scale   = pme->bFEP ? (grid_index < 9 ? 1.0-lambda_lj : lambda_lj) : 1.0;
-                scale  *= lb_scale_factor[grid_index-2];
+                    /* interpolate forces for our local atoms */
+                    where();
+                    bClearF = (bFirst && PAR(cr));
+                    scale   = pme->bFEP ? (fep_state < 1 ? 1.0-lambda_lj : lambda_lj) : 1.0;
+                    scale  *= lb_scale_factor[grid_index-2];
 #pragma omp parallel for num_threads(pme->nthread) schedule(static)
-                for (thread = 0; thread < pme->nthread; thread++)
-                {
-                    gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
-                                      &pme->atc[0].spline[thread],
-                                      scale);
-                }
-                where();
+                    for (thread = 0; thread < pme->nthread; thread++)
+                    {
+                        gather_f_bsplines(pme, grid, bClearF, &pme->atc[0],
+                                          &pme->atc[0].spline[thread],
+                                          scale);
+                    }
+                    where();
 
-                inc_nrnb(nrnb, eNR_GATHERFBSP,
-                         pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
-                wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
+                    inc_nrnb(nrnb, eNR_GATHERFBSP,
+                             pme->pme_order*pme->pme_order*pme->pme_order*pme->atc[0].n);
+                    wallcycle_stop(wcycle, ewcPME_SPREADGATHER);
 
-                bFirst = FALSE;
-            } /*for (grid_index = 8; grid_index >= 2; --grid_index)*/
-        }     /*if (bCalcF)*/
-    }         /*if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
+                    bFirst = FALSE;
+                } /* for (grid_index = 8; grid_index >= 2; --grid_index) */
+            }     /* if (bCalcF) */
+        }         /* for (fep_state = 0; fep_state < fep_states_lj; ++fep_state) */
+    }             /* if ((flags & GMX_PME_DO_LJ) && pme->ljpme_combination_rule == eljpmeLB) */
 
     if (bCalcF && pme->nnodes > 1)
     {