-/* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
+/*
+ * This file is part of the GROMACS molecular simulation package.
*
- *
- * This source code is part of
- *
- * G R O M A C S
- *
- * GROningen MAchine for Chemical Simulations
- *
- * VERSION 3.2.0
- * Written by David van der Spoel, Erik Lindahl, Berk Hess, and others.
* Copyright (c) 1991-2000, University of Groningen, The Netherlands.
- * Copyright (c) 2001-2004, The GROMACS development team,
- * check out http://www.gromacs.org for more information.
-
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
+ * Copyright (c) 2001-2004, The GROMACS development team.
+ * Copyright (c) 2013,2014, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
* of the License, or (at your option) any later version.
*
- * If you want to redistribute modifications, please consider that
- * scientific software is very special. Version control is crucial -
- * bugs must be traceable. We will be happy to consider code for
- * inclusion in the official distribution, but derived work must not
- * be called official GROMACS. Details are found in the README & COPYING
- * files - if they are missing, get the official version at www.gromacs.org.
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
*
- * To help us fund GROMACS development, we humbly ask that you cite
- * the papers on the package - you can find them in the top README file.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
- * For more info, check our website at http://www.gromacs.org
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
*
- * And Hey:
- * GROwing Monsters And Cloning Shrimps
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
*/
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "config.h"
+#include <assert.h>
#include <math.h>
#include <string.h>
-#include <assert.h>
-#include "sysstuff.h"
-#include "typedefs.h"
-#include "vec.h"
-#include "maths.h"
-#include "macros.h"
-#include "smalloc.h"
-#include "macros.h"
-#include "gmx_fatal.h"
-#include "gmx_fatal_collective.h"
-#include "physics.h"
-#include "force.h"
-#include "tables.h"
-#include "nonbonded.h"
-#include "invblock.h"
-#include "names.h"
-#include "network.h"
-#include "pbc.h"
-#include "ns.h"
-#include "mshift.h"
-#include "txtdump.h"
-#include "coulomb.h"
-#include "md_support.h"
-#include "md_logging.h"
-#include "domdec.h"
-#include "partdec.h"
-#include "qmmm.h"
-#include "copyrite.h"
-#include "mtop_util.h"
-#include "nbnxn_search.h"
-#include "nbnxn_atomdata.h"
-#include "nbnxn_consts.h"
-#include "statutil.h"
-#include "gmx_omp_nthreads.h"
-#include "gmx_detect_hardware.h"
-
-#ifdef _MSC_VER
-/* MSVC definition for __cpuid() */
-#include <intrin.h>
-#endif
-#include "types/nbnxn_cuda_types_ext.h"
-#include "gpu_utils.h"
-#include "nbnxn_cuda_data_mgmt.h"
-#include "pmalloc_cuda.h"
+#include "gromacs/legacyheaders/copyrite.h"
+#include "gromacs/legacyheaders/coulomb.h"
+#include "gromacs/legacyheaders/domdec.h"
+#include "gromacs/legacyheaders/force.h"
+#include "gromacs/legacyheaders/gmx_detect_hardware.h"
+#include "gromacs/legacyheaders/gmx_omp_nthreads.h"
+#include "gromacs/legacyheaders/gpu_utils.h"
+#include "gromacs/legacyheaders/inputrec.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/md_logging.h"
+#include "gromacs/legacyheaders/md_support.h"
+#include "gromacs/legacyheaders/names.h"
+#include "gromacs/legacyheaders/network.h"
+#include "gromacs/legacyheaders/nonbonded.h"
+#include "gromacs/legacyheaders/ns.h"
+#include "gromacs/legacyheaders/pmalloc_cuda.h"
+#include "gromacs/legacyheaders/qmmm.h"
+#include "gromacs/legacyheaders/tables.h"
+#include "gromacs/legacyheaders/txtdump.h"
+#include "gromacs/legacyheaders/typedefs.h"
+#include "gromacs/legacyheaders/types/commrec.h"
+#include "gromacs/legacyheaders/types/nbnxn_cuda_types_ext.h"
+#include "gromacs/math/units.h"
+#include "gromacs/math/utilities.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/nb_verlet.h"
+#include "gromacs/mdlib/nbnxn_atomdata.h"
+#include "gromacs/mdlib/nbnxn_consts.h"
+#include "gromacs/mdlib/nbnxn_search.h"
+#include "gromacs/mdlib/nbnxn_simd.h"
+#include "gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/topology/mtop_util.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/smalloc.h"
t_forcerec *mk_forcerec(void)
{
return nbfp;
}
+/* Build the table of C6 coefficients that the non-bonded kernels use to
+ * correct the LJ-PME reciprocal-space contribution inside the cut-off.
+ * The values follow the combination rule used on the reciprocal grid:
+ * geometric by default, or Lorentz-Berthelot when fr->ljpme_combination_rule
+ * is eljpmeLB and none of c6, c12i, c12j is numerically zero.
+ * Entries are stored at the C6 positions of an nbfp-style 2*atnr*atnr
+ * array and pre-scaled by 6; the C12 slots are never written here.
+ * The caller owns the returned array.
+ */
+static real *make_ljpme_c6grid(const gmx_ffparams_t *idef, t_forcerec *fr)
+{
+    int i, j, k, atnr;
+    real c6, c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
+    real *grid;
+
+    /* For LJ-PME simulations, we correct the energies with the reciprocal space
+     * inside of the cut-off. To do this the non-bonded kernels need to have
+     * access to the C6-values used on the reciprocal grid in pme.c
+     */
+
+    atnr = idef->atnr;
+    snew(grid, 2*atnr*atnr);
+    for (i = k = 0; (i < atnr); i++)
+    {
+        for (j = 0; (j < atnr); j++, k++)
+        {
+            /* Diagonal (same-type) LJ parameters of types i and j */
+            c6i = idef->iparams[i*(atnr+1)].lj.c6;
+            c12i = idef->iparams[i*(atnr+1)].lj.c12;
+            c6j = idef->iparams[j*(atnr+1)].lj.c6;
+            c12j = idef->iparams[j*(atnr+1)].lj.c12;
+            c6 = sqrt(c6i * c6j);
+            if (fr->ljpme_combination_rule == eljpmeLB
+                && !gmx_numzero(c6) && !gmx_numzero(c12i) && !gmx_numzero(c12j))
+            {
+                /* Lorentz-Berthelot: redo the combination in
+                 * sigma/epsilon form (arithmetic sigma, geometric epsilon).
+                 */
+                sigmai = pow(c12i / c6i, 1.0/6.0);
+                sigmaj = pow(c12j / c6j, 1.0/6.0);
+                epsi = c6i * c6i / c12i;
+                epsj = c6j * c6j / c12j;
+                c6 = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
+            }
+            /* Store the elements at the same relative positions as C6 in nbfp in order
+             * to simplify access in the kernels
+             */
+            grid[2*(atnr*i+j)] = c6*6.0;
+        }
+    }
+    return grid;
+}
+
+/* Build a full nbfp-style 2*atnr*atnr matrix of pair LJ parameters
+ * (C6 pre-scaled by 6, C12 by 12) from the diagonal atom-type
+ * parameters in idef, using the requested combination rule:
+ * geometric averaging by default, or Lorentz-Berthelot (arithmetic
+ * sigma, geometric epsilon) when comb_rule == eCOMB_ARITHMETIC and
+ * the geometric c6 and c12 are both non-zero.
+ * The caller owns the returned array (free with sfree).
+ */
+static real *mk_nbfp_combination_rule(const gmx_ffparams_t *idef, int comb_rule)
+{
+    real *nbfp;
+    int i, j, k, atnr; /* NOTE(review): k is declared but never used here */
+    real c6i, c6j, c12i, c12j, epsi, epsj, sigmai, sigmaj;
+    real c6, c12;
+
+    atnr = idef->atnr;
+    snew(nbfp, 2*atnr*atnr);
+    for (i = 0; i < atnr; ++i)
+    {
+        for (j = 0; j < atnr; ++j)
+        {
+            /* Diagonal (same-type) LJ parameters of types i and j */
+            c6i = idef->iparams[i*(atnr+1)].lj.c6;
+            c12i = idef->iparams[i*(atnr+1)].lj.c12;
+            c6j = idef->iparams[j*(atnr+1)].lj.c6;
+            c12j = idef->iparams[j*(atnr+1)].lj.c12;
+            c6 = sqrt(c6i * c6j);
+            c12 = sqrt(c12i * c12j);
+            if (comb_rule == eCOMB_ARITHMETIC
+                && !gmx_numzero(c6) && !gmx_numzero(c12))
+            {
+                /* Lorentz-Berthelot: redo the combination in
+                 * sigma/epsilon form (arithmetic sigma, geometric epsilon).
+                 */
+                sigmai = pow(c12i / c6i, 1.0/6.0);
+                sigmaj = pow(c12j / c6j, 1.0/6.0);
+                epsi = c6i * c6i / c12i;
+                epsj = c6j * c6j / c12j;
+                c6 = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 6);
+                c12 = sqrt(epsi * epsj) * pow(0.5*(sigmai+sigmaj), 12);
+            }
+            /* Scale by 6 and 12 to match the nbfp kernel convention */
+            C6(nbfp, atnr, i, j) = c6*6.0;
+            C12(nbfp, atnr, i, j) = c12*12.0;
+        }
+    }
+    return nbfp;
+}
+
/* This routine sets fr->solvent_opt to the most common solvent in the
* system, e.g. esolSPC or esolTIP4P. It will also mark each charge group in
* the fr->solvent_type array with the correct type (or esolNO).
int cginfo,
int *cg_sp)
{
- const t_blocka * excl;
+ const t_blocka *excl;
t_atom *atom;
int j, k;
int j0, j1, nj;
gmx_bool perturbed;
gmx_bool has_vdw[4];
gmx_bool match;
- real tmp_charge[4];
- int tmp_vdwtype[4];
+ real tmp_charge[4] = { 0.0 }; /* init to zero to make gcc4.8 happy */
+ int tmp_vdwtype[4] = { 0 }; /* init to zero to make gcc4.8 happy */
int tjA;
gmx_bool qm;
solvent_parameters_t *solvent_parameters;
bestsol = esolNO;
}
-#ifdef DISABLE_WATER_NLIST
- bestsol = esolNO;
-#endif
-
fr->nWatMol = 0;
for (mb = 0; mb < mtop->nmolblock; mb++)
{
static cginfo_mb_t *init_cginfo_mb(FILE *fplog, const gmx_mtop_t *mtop,
t_forcerec *fr, gmx_bool bNoSolvOpt,
+ gmx_bool *bFEP_NonBonded,
gmx_bool *bExcl_IntraCGAll_InterCGNone)
{
const t_block *cgs;
int *a_con;
int ftype;
int ia;
- gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ;
+ gmx_bool bId, *bExcl, bExclIntraAll, bExclInter, bHaveVDW, bHaveQ, bHavePerturbedAtoms;
ncg_tot = ncg_mtop(mtop);
snew(cginfo_mb, mtop->nmolblock);
}
}
+ *bFEP_NonBonded = FALSE;
*bExcl_IntraCGAll_InterCGNone = TRUE;
excl_nalloc = 10;
/* bExclIntraAll: all intra cg interactions excluded
* bExclInter: any inter cg interactions excluded
*/
- bExclIntraAll = TRUE;
- bExclInter = FALSE;
- bHaveVDW = FALSE;
- bHaveQ = FALSE;
+ bExclIntraAll = TRUE;
+ bExclInter = FALSE;
+ bHaveVDW = FALSE;
+ bHaveQ = FALSE;
+ bHavePerturbedAtoms = FALSE;
for (ai = a0; ai < a1; ai++)
{
/* Check VDW and electrostatic interactions */
bHaveQ = bHaveQ || (molt->atoms.atom[ai].q != 0 ||
molt->atoms.atom[ai].qB != 0);
+ bHavePerturbedAtoms = bHavePerturbedAtoms || (PERTURBED(molt->atoms.atom[ai]) != 0);
+
/* Clear the exclusion list for atom ai */
for (aj = a0; aj < a1; aj++)
{
{
SET_CGINFO_HAS_Q(cginfo[cgm+cg]);
}
+ if (bHavePerturbedAtoms && fr->efep != efepNO)
+ {
+ SET_CGINFO_FEP(cginfo[cgm+cg]);
+ *bFEP_NonBonded = TRUE;
+ }
/* Store the charge group size */
SET_CGINFO_NATOMS(cginfo[cgm+cg], a1-a0);
static void set_chargesum(FILE *log, t_forcerec *fr, const gmx_mtop_t *mtop)
{
- double qsum, q2sum, q;
+ /*This now calculates sum for q and c6*/
+ double qsum, q2sum, q, c6sum, c6;
int mb, nmol, i;
const t_atoms *atoms;
- qsum = 0;
- q2sum = 0;
+ qsum = 0;
+ q2sum = 0;
+ c6sum = 0;
for (mb = 0; mb < mtop->nmolblock; mb++)
{
nmol = mtop->molblock[mb].nmol;
atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
for (i = 0; i < atoms->nr; i++)
{
- q = atoms->atom[i].q;
- qsum += nmol*q;
- q2sum += nmol*q*q;
+ q = atoms->atom[i].q;
+ qsum += nmol*q;
+ q2sum += nmol*q*q;
+ c6 = mtop->ffparams.iparams[atoms->atom[i].type*(mtop->ffparams.atnr+1)].lj.c6;
+ c6sum += nmol*c6;
}
}
- fr->qsum[0] = qsum;
- fr->q2sum[0] = q2sum;
+ fr->qsum[0] = qsum;
+ fr->q2sum[0] = q2sum;
+ fr->c6sum[0] = c6sum;
+
if (fr->efep != efepNO)
{
- qsum = 0;
- q2sum = 0;
+ qsum = 0;
+ q2sum = 0;
+ c6sum = 0;
for (mb = 0; mb < mtop->nmolblock; mb++)
{
nmol = mtop->molblock[mb].nmol;
atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
for (i = 0; i < atoms->nr; i++)
{
- q = atoms->atom[i].qB;
- qsum += nmol*q;
- q2sum += nmol*q*q;
+ q = atoms->atom[i].qB;
+ qsum += nmol*q;
+ q2sum += nmol*q*q;
+ c6 = mtop->ffparams.iparams[atoms->atom[i].typeB*(mtop->ffparams.atnr+1)].lj.c6;
+ c6sum += nmol*c6;
}
- fr->qsum[1] = qsum;
- fr->q2sum[1] = q2sum;
+ fr->qsum[1] = qsum;
+ fr->q2sum[1] = q2sum;
+ fr->c6sum[1] = c6sum;
}
}
else
{
- fr->qsum[1] = fr->qsum[0];
- fr->q2sum[1] = fr->q2sum[0];
+ fr->qsum[1] = fr->qsum[0];
+ fr->q2sum[1] = fr->q2sum[0];
+ fr->c6sum[1] = fr->c6sum[0];
}
if (log)
{
const t_atoms *atoms, *atoms_tpi;
const t_blocka *excl;
int mb, nmol, nmolc, i, j, tpi, tpj, j1, j2, k, n, nexcl, q;
-#if (defined SIZEOF_LONG_LONG_INT) && (SIZEOF_LONG_LONG_INT >= 8)
- long long int npair, npair_ij, tmpi, tmpj;
-#else
- double npair, npair_ij, tmpi, tmpj;
-#endif
+ gmx_int64_t npair, npair_ij, tmpi, tmpj;
double csix, ctwelve;
int ntp, *typecount;
gmx_bool bBHAM;
real *nbfp;
+ real *nbfp_comb = NULL;
ntp = fr->ntype;
bBHAM = fr->bBHAM;
nbfp = fr->nbfp;
+ /* For LJ-PME, we want to correct for the difference between the
+ * actual C6 values and the C6 values used by the LJ-PME based on
+ * combination rules. */
+
+ if (EVDW_PME(fr->vdwtype))
+ {
+ nbfp_comb = mk_nbfp_combination_rule(&mtop->ffparams,
+ (fr->ljpme_combination_rule == eljpmeLB) ? eCOMB_ARITHMETIC : eCOMB_GEOMETRIC);
+ for (tpi = 0; tpi < ntp; ++tpi)
+ {
+ for (tpj = 0; tpj < ntp; ++tpj)
+ {
+ C6(nbfp_comb, ntp, tpi, tpj) =
+ C6(nbfp, ntp, tpi, tpj) - C6(nbfp_comb, ntp, tpi, tpj);
+ C12(nbfp_comb, ntp, tpi, tpj) = C12(nbfp, ntp, tpi, tpj);
+ }
+ }
+ nbfp = nbfp_comb;
+ }
for (q = 0; q < (fr->efep == efepNO ? 1 : 2); q++)
{
csix = 0;
{
/* Count the types so we avoid natoms^2 operations */
snew(typecount, ntp);
- for (mb = 0; mb < mtop->nmolblock; mb++)
- {
- nmol = mtop->molblock[mb].nmol;
- atoms = &mtop->moltype[mtop->molblock[mb].type].atoms;
- for (i = 0; i < atoms->nr; i++)
- {
- if (q == 0)
- {
- tpi = atoms->atom[i].type;
- }
- else
- {
- tpi = atoms->atom[i].typeB;
- }
- typecount[tpi] += nmol;
- }
- }
+ gmx_mtop_count_atomtypes(mtop, q, typecount);
+
for (tpi = 0; tpi < ntp; tpi++)
{
for (tpj = tpi; tpj < ntp; tpj++)
fr->avcsix[q] = csix;
fr->avctwelve[q] = ctwelve;
}
+
+ if (EVDW_PME(fr->vdwtype))
+ {
+ sfree(nbfp_comb);
+ }
+
if (fplog != NULL)
{
if (fr->eDispCorr == edispcAllEner ||
if (bAllvsAll && fp && MASTER(cr))
{
- fprintf(fp, "\nUsing accelerated all-vs-all kernels.\n\n");
+ fprintf(fp, "\nUsing SIMD all-vs-all kernels.\n\n");
}
return bAllvsAll;
}
+/* Return TRUE when the non-bonded settings in ir are supported by the
+ * accelerated nbnxn kernels (GPU kernels when bGPU, SIMD kernels
+ * otherwise); otherwise warn via md_print_warn and return FALSE.
+ * The only combination rejected here is LJ-PME with the
+ * Lorentz-Berthelot combination rule on the non-GPU (SIMD) path.
+ * NOTE(review): inside the branch bGPU is necessarily FALSE (the
+ * condition starts with !bGPU), so the two ternaries below always
+ * select the "SIMD kernels"/"plain-C kernels" strings.
+ */
+gmx_bool nbnxn_acceleration_supported(FILE *fplog,
+                                      const t_commrec *cr,
+                                      const t_inputrec *ir,
+                                      gmx_bool bGPU)
+{
+    if (!bGPU && (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB))
+    {
+        md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with %s, falling back to %s\n",
+                      bGPU ? "GPUs" : "SIMD kernels",
+                      bGPU ? "CPU only" : "plain-C kernels");
+        return FALSE;
+    }
+
+    return TRUE;
+}
+
+
static void pick_nbnxn_kernel_cpu(const t_inputrec gmx_unused *ir,
int *kernel_type,
int *ewald_excl)
*kernel_type = nbnxnk4xN_SIMD_4xN;
#endif
#ifdef GMX_NBNXN_SIMD_2XNN
- /* We expect the 2xNN kernels to be faster in most cases */
*kernel_type = nbnxnk4xN_SIMD_2xNN;
#endif
-#if defined GMX_NBNXN_SIMD_4XN && defined GMX_X86_AVX_256
- if (EEL_RF(ir->coulombtype) || ir->coulombtype == eelCUT)
+#if defined GMX_NBNXN_SIMD_2XNN && defined GMX_NBNXN_SIMD_4XN
+ /* We need to choose if we want 2x(N+N) or 4xN kernels.
+ * Currently this is based on the SIMD acceleration choice,
+ * but it might be better to decide this at runtime based on CPU.
+ *
+ * 4xN calculates more (zero) interactions, but has less pair-search
+ * work and much better kernel instruction scheduling.
+ *
+ * Up till now we have only seen that on Intel Sandy/Ivy Bridge,
+ * which doesn't have FMA, both the analytical and tabulated Ewald
+ * kernels have similar pair rates for 4x8 and 2x(4+4), so we choose
+ * 2x(4+4) because it results in significantly fewer pairs.
+ * For RF, the raw pair rate of the 4x8 kernel is higher than 2x(4+4),
+ * 10% with HT, 50% without HT. As we currently don't detect the actual
+ * use of HT, use 4x8 to avoid a potential performance hit.
+ * On Intel Haswell 4x8 is always faster.
+ */
+ *kernel_type = nbnxnk4xN_SIMD_4xN;
+
+#ifndef GMX_SIMD_HAVE_FMA
+ if (EEL_PME_EWALD(ir->coulombtype) ||
+ EVDW_PME(ir->vdwtype))
{
- /* The raw pair rate of the 4x8 kernel is higher than 2x(4+4),
- * 10% with HT, 50% without HT, but extra zeros interactions
- * can compensate. As we currently don't detect the actual use
- * of HT, switch to 4x8 to avoid a potential performance hit.
+ /* We have Ewald kernels without FMA (Intel Sandy/Ivy Bridge).
+ * There are enough instructions to make 2x(4+4) efficient.
*/
- *kernel_type = nbnxnk4xN_SIMD_4xN;
+ *kernel_type = nbnxnk4xN_SIMD_2xNN;
}
#endif
+#endif /* GMX_NBNXN_SIMD_2XNN && GMX_NBNXN_SIMD_4XN */
+
+
if (getenv("GMX_NBNXN_SIMD_4XN") != NULL)
{
#ifdef GMX_NBNXN_SIMD_4XN
#endif
}
- /* Analytical Ewald exclusion correction is only an option in the
- * x86 SIMD kernel. This is faster in single precision
- * on Bulldozer and slightly faster on Sandy Bridge.
+ /* Analytical Ewald exclusion correction is only an option in
+ * the SIMD kernel.
+ * Since table lookup's don't parallelize with SIMD, analytical
+ * will probably always be faster for a SIMD width of 8 or more.
+     * With FMA analytical is sometimes faster for a width of 4 as well.
+ * On BlueGene/Q, this is faster regardless of precision.
+ * In single precision, this is faster on Bulldozer.
*/
-#if (defined GMX_X86_AVX_128_FMA || defined GMX_X86_AVX_256) && !defined GMX_DOUBLE
+#if GMX_SIMD_REAL_WIDTH >= 8 || \
+ (GMX_SIMD_REAL_WIDTH >= 4 && defined GMX_SIMD_HAVE_FMA && !defined GMX_DOUBLE) || \
+ defined GMX_SIMD_IBM_QPX
*ewald_excl = ewaldexclAnalytical;
#endif
if (getenv("GMX_NBNXN_EWALD_TABLE") != NULL)
}
}
-#endif /* GMX_X86_SSE2 */
+#endif /* GMX_NBNXN_SIMD */
}
case nbnxnk4xN_SIMD_4xN:
case nbnxnk4xN_SIMD_2xNN:
#ifdef GMX_NBNXN_SIMD
-#ifdef GMX_X86_SSE2
- /* We have x86 SSE2 compatible SIMD */
-#ifdef GMX_X86_AVX_128_FMA
- returnvalue = "AVX-128-FMA";
-#else
-#if defined GMX_X86_AVX_256 || defined __AVX__
- /* x86 SIMD intrinsics can be converted to SSE or AVX depending
- * on compiler flags. As we use nearly identical intrinsics,
- * compiling for AVX without an AVX macros effectively results
- * in AVX kernels.
- * For gcc we check for __AVX__
- * At least a check for icc should be added (if there is a macro)
- */
-#if defined GMX_X86_AVX_256 && !defined GMX_NBNXN_HALF_WIDTH_SIMD
- returnvalue = "AVX-256";
-#else
- returnvalue = "AVX-128";
-#endif
-#else
-#ifdef GMX_X86_SSE4_1
- returnvalue = "SSE4.1";
+#if defined GMX_SIMD_X86_SSE2
+ returnvalue = "SSE2";
+#elif defined GMX_SIMD_X86_SSE4_1
+ returnvalue = "SSE4.1";
+#elif defined GMX_SIMD_X86_AVX_128_FMA
+ returnvalue = "AVX_128_FMA";
+#elif defined GMX_SIMD_X86_AVX_256
+ returnvalue = "AVX_256";
+#elif defined GMX_SIMD_X86_AVX2_256
+ returnvalue = "AVX2_256";
#else
- returnvalue = "SSE2";
-#endif
-#endif
+ returnvalue = "SIMD";
#endif
-#else /* GMX_X86_SSE2 */
- /* not GMX_X86_SSE2, but other SIMD */
- returnvalue = "SIMD";
-#endif /* GMX_X86_SSE2 */
-#else /* GMX_NBNXN_SIMD */
+#else /* GMX_NBNXN_SIMD */
returnvalue = "not available";
#endif /* GMX_NBNXN_SIMD */
break;
static void pick_nbnxn_kernel(FILE *fp,
const t_commrec *cr,
- gmx_bool use_cpu_acceleration,
+ gmx_bool use_simd_kernels,
gmx_bool bUseGPU,
gmx_bool bEmulateGPU,
const t_inputrec *ir,
if (*kernel_type == nbnxnkNotSet)
{
- if (use_cpu_acceleration)
+ /* LJ PME with LB combination rule does 7 mesh operations.
+     * This is so slow that we don't compile SIMD non-bonded kernels for that.
+ */
+ if (use_simd_kernels &&
+ nbnxn_acceleration_supported(fp, cr, ir, FALSE))
{
pick_nbnxn_kernel_cpu(ir, kernel_type, ewald_excl);
}
lookup_nbnxn_kernel_name(*kernel_type),
nbnxn_kernel_pairlist_simple(*kernel_type) ? NBNXN_CPU_CLUSTER_I_SIZE : NBNXN_GPU_CLUSTER_SIZE,
nbnxn_kernel_to_cj_size(*kernel_type));
+
+ if (nbnxnk4x4_PlainC == *kernel_type ||
+ nbnxnk8x8x8_PlainC == *kernel_type)
+ {
+ md_print_warn(cr, fp,
+ "WARNING: Using the slow %s kernels. This should\n"
+ "not happen during routine usage on supported platforms.\n\n",
+ lookup_nbnxn_kernel_name(*kernel_type));
+ }
}
}
const gmx_hw_info_t *hwinfo,
gmx_bool bDoNonbonded,
gmx_bool *bUseGPU,
- gmx_bool *bEmulateGPU)
+ gmx_bool *bEmulateGPU,
+ const gmx_gpu_opt_t *gpu_opt)
{
gmx_bool bEmulateGPUEnvVarSet;
char gpu_err_str[STRLEN];
* Note that you should freezing the system as otherwise it will explode.
*/
*bEmulateGPU = (bEmulateGPUEnvVarSet ||
- (!bDoNonbonded && hwinfo->bCanUseGPU));
+ (!bDoNonbonded &&
+ gpu_opt->ncuda_dev_use > 0));
/* Enable GPU mode when GPUs are available or no GPU emulation is requested.
*/
- if (hwinfo->bCanUseGPU && !(*bEmulateGPU))
+ if (gpu_opt->ncuda_dev_use > 0 && !(*bEmulateGPU))
{
/* Each PP node will use the intra-node id-th device from the
* list of detected/selected GPUs. */
- if (!init_gpu(cr->rank_pp_intranode, gpu_err_str, &hwinfo->gpu_info))
+ if (!init_gpu(cr->rank_pp_intranode, gpu_err_str,
+ &hwinfo->gpu_info, gpu_opt))
{
/* At this point the init should never fail as we made sure that
* we have all the GPUs we need. If it still does, we'll bail. */
- gmx_fatal(FARGS, "On node %d failed to initialize GPU #%d: %s",
+ gmx_fatal(FARGS, "On rank %d failed to initialize GPU #%d: %s",
cr->nodeid,
- get_gpu_device_id(&hwinfo->gpu_info, cr->rank_pp_intranode),
+ get_gpu_device_id(&hwinfo->gpu_info, gpu_opt,
+ cr->rank_pp_intranode),
gpu_err_str);
}
if (bUsesSimpleTables)
{
- /* With a spacing of 0.0005 we are at the force summation accuracy
- * for the SSE kernels for "normal" atomistic simulations.
+ /* Get the Ewald table spacing based on Coulomb and/or LJ
+ * Ewald coefficients and rtol.
*/
- ic->tabq_scale = ewald_spline3_table_scale(ic->ewaldcoeff,
- ic->rcoulomb);
+ ic->tabq_scale = ewald_spline3_table_scale(ic);
maxr = (rtab > ic->rcoulomb) ? rtab : ic->rcoulomb;
ic->tabq_size = (int)(maxr*ic->tabq_scale) + 2;
sfree_aligned(ic->tabq_coul_F);
sfree_aligned(ic->tabq_coul_V);
- /* Create the original table data in FDV0 */
- snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
- snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
- snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
- table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
- ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff);
+ sfree_aligned(ic->tabq_vdw_FDV0);
+ sfree_aligned(ic->tabq_vdw_F);
+ sfree_aligned(ic->tabq_vdw_V);
+
+ if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
+ {
+ /* Create the original table data in FDV0 */
+ snew_aligned(ic->tabq_coul_FDV0, ic->tabq_size*4, 32);
+ snew_aligned(ic->tabq_coul_F, ic->tabq_size, 32);
+ snew_aligned(ic->tabq_coul_V, ic->tabq_size, 32);
+ table_spline3_fill_ewald_lr(ic->tabq_coul_F, ic->tabq_coul_V, ic->tabq_coul_FDV0,
+ ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff_q, v_q_ewald_lr);
+ }
+
+ if (EVDW_PME(ic->vdwtype))
+ {
+ snew_aligned(ic->tabq_vdw_FDV0, ic->tabq_size*4, 32);
+ snew_aligned(ic->tabq_vdw_F, ic->tabq_size, 32);
+ snew_aligned(ic->tabq_vdw_V, ic->tabq_size, 32);
+ table_spline3_fill_ewald_lr(ic->tabq_vdw_F, ic->tabq_vdw_V, ic->tabq_vdw_FDV0,
+ ic->tabq_size, 1/ic->tabq_scale, ic->ewaldcoeff_lj, v_lj_ewald_lr);
+ }
}
void init_interaction_const_tables(FILE *fp,
{
real spacing;
- if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype))
+ if (ic->eeltype == eelEWALD || EEL_PME(ic->eeltype) || EVDW_PME(ic->vdwtype))
{
init_ewald_f_table(ic, bUsesSimpleTables, rtab);
}
}
-void init_interaction_const(FILE *fp,
- interaction_const_t **interaction_const,
- const t_forcerec *fr,
- real rtab)
+/* Reset the force-switch shift constants to zero, i.e. no force
+ * switching and no potential shift contribution from these terms.
+ */
+static void clear_force_switch_constants(shift_consts_t *sc)
+{
+    sc->c2 = 0;
+    sc->c3 = 0;
+    sc->cpot = 0;
+}
+
+/* Compute the coefficients that switch a r^-p force smoothly to zero
+ * between r = rsw and the cut-off r = rc. The power p is absorbed
+ * into the coefficients (see the derivation below).
+ */
+static void force_switch_constants(real p,
+                                   real rsw, real rc,
+                                   shift_consts_t *sc)
+{
+    /* Here we determine the coefficient for shifting the force to zero
+     * between distance rsw and the cut-off rc.
+     * For a potential of r^-p, we have force p*r^-(p+1).
+     * But to save flops we absorb p in the coefficient.
+     * Thus we get (the polynomial acts in the shifted distance r - rsw,
+     * as the cpot expression below shows):
+     * force/p = r^-(p+1) + c2*(r - rsw)^2 + c3*(r - rsw)^3
+     * potential = r^-p + c2/3*(r - rsw)^3 + c3/4*(r - rsw)^4 + cpot
+     */
+    sc->c2 = ((p + 1)*rsw - (p + 4)*rc)/(pow(rc, p + 2)*pow(rc - rsw, 2));
+    sc->c3 = -((p + 1)*rsw - (p + 3)*rc)/(pow(rc, p + 2)*pow(rc - rsw, 3));
+    sc->cpot = -pow(rc, -p) + p*sc->c2/3*pow(rc - rsw, 3) + p*sc->c3/4*pow(rc - rsw, 4);
+}
+
+/* Compute the quintic-spline coefficients for the potential-switch
+ * modifier, which scales the potential smoothly from 1 at r = rsw
+ * down to 0 at the cut-off r = rc.
+ */
+static void potential_switch_constants(real rsw, real rc,
+                                       switch_consts_t *sc)
+{
+    /* The switch function sw is 1 at r = rsw and 0 at r = rc.
+     * Its first and second derivatives are zero at both ends
+     * (check: c3+c4+c5 scaled by (rc-rsw)^k sums to -1, derivative to 0).
+     * With x = max(r - rsw, 0):
+     * sw  = 1 + c3*x^3 + c4*x^4 + c5*x^5
+     * dsw = 3*c3*x^2 + 4*c4*x^3 + 5*c5*x^4
+     * force = force*sw - potential*dsw   (product rule on V*sw)
+     * potential *= sw
+     */
+    sc->c3 = -10*pow(rc - rsw, -3);
+    sc->c4 = 15*pow(rc - rsw, -4);
+    sc->c5 = -6*pow(rc - rsw, -5);
+}
+
+static void
+init_interaction_const(FILE *fp,
+ const t_commrec gmx_unused *cr,
+ interaction_const_t **interaction_const,
+ const t_forcerec *fr,
+ real rtab)
{
interaction_const_t *ic;
gmx_bool bUsesSimpleTables = TRUE;
snew_aligned(ic->tabq_coul_F, 16, 32);
snew_aligned(ic->tabq_coul_V, 16, 32);
- ic->rlist = fr->rlist;
- ic->rlistlong = fr->rlistlong;
+ ic->rlist = fr->rlist;
+ ic->rlistlong = fr->rlistlong;
/* Lennard-Jones */
- ic->rvdw = fr->rvdw;
- if (fr->vdw_modifier == eintmodPOTSHIFT)
- {
- ic->sh_invrc6 = pow(ic->rvdw, -6.0);
- }
- else
- {
- ic->sh_invrc6 = 0;
+ ic->vdwtype = fr->vdwtype;
+ ic->vdw_modifier = fr->vdw_modifier;
+ ic->rvdw = fr->rvdw;
+ ic->rvdw_switch = fr->rvdw_switch;
+ ic->ewaldcoeff_lj = fr->ewaldcoeff_lj;
+ ic->ljpme_comb_rule = fr->ljpme_combination_rule;
+ ic->sh_lj_ewald = 0;
+ clear_force_switch_constants(&ic->dispersion_shift);
+ clear_force_switch_constants(&ic->repulsion_shift);
+
+ switch (ic->vdw_modifier)
+ {
+ case eintmodPOTSHIFT:
+ /* Only shift the potential, don't touch the force */
+ ic->dispersion_shift.cpot = -pow(ic->rvdw, -6.0);
+ ic->repulsion_shift.cpot = -pow(ic->rvdw, -12.0);
+ if (EVDW_PME(ic->vdwtype))
+ {
+ real crc2;
+
+ crc2 = sqr(ic->ewaldcoeff_lj*ic->rvdw);
+ ic->sh_lj_ewald = (exp(-crc2)*(1 + crc2 + 0.5*crc2*crc2) - 1)*pow(ic->rvdw, -6.0);
+ }
+ break;
+ case eintmodFORCESWITCH:
+ /* Switch the force, switch and shift the potential */
+ force_switch_constants(6.0, ic->rvdw_switch, ic->rvdw,
+ &ic->dispersion_shift);
+ force_switch_constants(12.0, ic->rvdw_switch, ic->rvdw,
+ &ic->repulsion_shift);
+ break;
+ case eintmodPOTSWITCH:
+ /* Switch the potential and force */
+ potential_switch_constants(ic->rvdw_switch, ic->rvdw,
+ &ic->vdw_switch);
+ break;
+ case eintmodNONE:
+ case eintmodEXACTCUTOFF:
+ /* Nothing to do here */
+ break;
+ default:
+ gmx_incons("unimplemented potential modifier");
}
+ ic->sh_invrc6 = -ic->dispersion_shift.cpot;
+
/* Electrostatics */
- ic->eeltype = fr->eeltype;
- ic->rcoulomb = fr->rcoulomb;
- ic->epsilon_r = fr->epsilon_r;
- ic->epsfac = fr->epsfac;
+ ic->eeltype = fr->eeltype;
+ ic->coulomb_modifier = fr->coulomb_modifier;
+ ic->rcoulomb = fr->rcoulomb;
+ ic->epsilon_r = fr->epsilon_r;
+ ic->epsfac = fr->epsfac;
+ ic->ewaldcoeff_q = fr->ewaldcoeff_q;
- /* Ewald */
- ic->ewaldcoeff = fr->ewaldcoeff;
if (fr->coulomb_modifier == eintmodPOTSHIFT)
{
- ic->sh_ewald = gmx_erfc(ic->ewaldcoeff*ic->rcoulomb);
+ ic->sh_ewald = gmx_erfc(ic->ewaldcoeff_q*ic->rcoulomb);
}
else
{
if (fp != NULL)
{
- fprintf(fp, "Potential shift: LJ r^-12: %.3f r^-6 %.3f",
- sqr(ic->sh_invrc6), ic->sh_invrc6);
+ real dispersion_shift;
+
+ dispersion_shift = ic->dispersion_shift.cpot;
+ if (EVDW_PME(ic->vdwtype))
+ {
+ dispersion_shift -= ic->sh_lj_ewald;
+ }
+ fprintf(fp, "Potential shift: LJ r^-12: %.3e r^-6: %.3e",
+ ic->repulsion_shift.cpot, dispersion_shift);
+
if (ic->eeltype == eelCUT)
{
- fprintf(fp, ", Coulomb %.3f", ic->c_rf);
+ fprintf(fp, ", Coulomb %.e", -ic->c_rf);
}
else if (EEL_PME(ic->eeltype))
{
- fprintf(fp, ", Ewald %.3e", ic->sh_ewald);
+ fprintf(fp, ", Ewald %.3e", -ic->sh_ewald);
}
fprintf(fp, "\n");
}
if (fr->nbv != NULL && fr->nbv->bUseGPU)
{
nbnxn_cuda_init_const(fr->nbv->cu_nbv, ic, fr->nbv->grp);
+
+ /* With tMPI + GPUs some ranks may be sharing GPU(s) and therefore
+ * also sharing texture references. To keep the code simple, we don't
+ * treat texture references as shared resources, but this means that
+ * the coulomb_tab and nbfp texture refs will get updated by multiple threads.
+ * Hence, to ensure that the non-bonded kernels don't start before all
+ * texture binding operations are finished, we need to wait for all ranks
+ * to arrive here before continuing.
+ *
+ * Note that we could omit this barrier if GPUs are not shared (or
+ * texture objects are used), but as this is initialization code, there
+ * is not point in complicating things.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
}
bUsesSimpleTables = uses_simple_tables(fr->cutoff_scheme, fr->nbv, -1);
static void init_nb_verlet(FILE *fp,
nonbonded_verlet_t **nb_verlet,
+ gmx_bool bFEP_NonBonded,
const t_inputrec *ir,
const t_forcerec *fr,
const t_commrec *cr,
pick_nbnxn_resources(cr, fr->hwinfo,
fr->bNonbonded,
&nbv->bUseGPU,
- &bEmulateGPU);
+ &bEmulateGPU,
+ fr->gpu_opt);
nbv->nbs = NULL;
if (i == 0) /* local */
{
- pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+ pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
nbv->bUseGPU, bEmulateGPU, ir,
&nbv->grp[i].kernel_type,
&nbv->grp[i].ewald_excl,
if (nbpu_opt != NULL && strcmp(nbpu_opt, "gpu_cpu") == 0)
{
/* Use GPU for local, select a CPU kernel for non-local */
- pick_nbnxn_kernel(fp, cr, fr->use_cpu_acceleration,
+ pick_nbnxn_kernel(fp, cr, fr->use_simd_kernels,
FALSE, FALSE, ir,
&nbv->grp[i].kernel_type,
&nbv->grp[i].ewald_excl,
/* init the NxN GPU data; the last argument tells whether we'll have
* both local and non-local NB calculation on GPU */
nbnxn_cuda_init(fp, &nbv->cu_nbv,
- &fr->hwinfo->gpu_info, cr->rank_pp_intranode,
+ &fr->hwinfo->gpu_info, fr->gpu_opt,
+ cr->rank_pp_intranode,
(nbv->ngrp > 1) && !bHybridGPURun);
if ((env = getenv("GMX_NB_MIN_CI")) != NULL)
nbnxn_init_search(&nbv->nbs,
DOMAINDECOMP(cr) ? &cr->dd->nc : NULL,
DOMAINDECOMP(cr) ? domdec_zones(cr->dd) : NULL,
- gmx_omp_nthreads_get(emntNonbonded));
+ bFEP_NonBonded,
+ gmx_omp_nthreads_get(emntPairsearch));
for (i = 0; i < nbv->ngrp; i++)
{
if (i == 0 ||
nbv->grp[0].kernel_type != nbv->grp[i].kernel_type)
{
+ gmx_bool bSimpleList;
+ int enbnxninitcombrule;
+
+ bSimpleList = nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type);
+
+ if (bSimpleList && (fr->vdwtype == evdwCUT && (fr->vdw_modifier == eintmodNONE || fr->vdw_modifier == eintmodPOTSHIFT)))
+ {
+ /* Plain LJ cut-off: we can optimize with combination rules */
+ enbnxninitcombrule = enbnxninitcombruleDETECT;
+ }
+ else if (fr->vdwtype == evdwPME)
+ {
+ /* LJ-PME: we need to use a combination rule for the grid */
+ if (fr->ljpme_combination_rule == eljpmeGEOM)
+ {
+ enbnxninitcombrule = enbnxninitcombruleGEOM;
+ }
+ else
+ {
+ enbnxninitcombrule = enbnxninitcombruleLB;
+ }
+ }
+ else
+ {
+ /* We use a full combination matrix: no rule required */
+ enbnxninitcombrule = enbnxninitcombruleNONE;
+ }
+
+
snew(nbv->grp[i].nbat, 1);
nbnxn_atomdata_init(fp,
nbv->grp[i].nbat,
nbv->grp[i].kernel_type,
+ enbnxninitcombrule,
fr->ntype, fr->nbfp,
ir->opts.ngener,
- nbnxn_kernel_pairlist_simple(nbv->grp[i].kernel_type) ? gmx_omp_nthreads_get(emntNonbonded) : 1,
+ bSimpleList ? gmx_omp_nthreads_get(emntNonbonded) : 1,
nb_alloc, nb_free);
}
else
real rtab;
char *env;
double dbl;
- rvec box_size;
const t_block *cgs;
gmx_bool bGenericKernelOnly;
- gmx_bool bTab, bSep14tab, bNormalnblists;
+ gmx_bool bMakeTables, bMakeSeparate14Table, bSomeNormalNbListsAreInUse;
+ gmx_bool bFEP_NonBonded;
t_nblists *nbl;
int *nm_ind, egp_flags;
* In mdrun, hwinfo has already been set before calling init_forcerec.
* Here we ignore GPUs, as tools will not use them anyhow.
*/
- fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE, FALSE, NULL);
+ fr->hwinfo = gmx_detect_hardware(fp, cr, FALSE);
}
- /* By default we turn acceleration on, but it might be turned off further down... */
- fr->use_cpu_acceleration = TRUE;
+ /* By default we turn SIMD kernels on, but it might be turned off further down... */
+ fr->use_simd_kernels = TRUE;
fr->bDomDec = DOMAINDECOMP(cr);
bNoSolvOpt = TRUE;
}
- if ( (getenv("GMX_DISABLE_CPU_ACCELERATION") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
+ if ( (getenv("GMX_DISABLE_SIMD_KERNELS") != NULL) || (getenv("GMX_NOOPTIMIZEDKERNELS") != NULL) )
{
- fr->use_cpu_acceleration = FALSE;
+ fr->use_simd_kernels = FALSE;
if (fp != NULL)
{
fprintf(fp,
- "\nFound environment variable GMX_DISABLE_CPU_ACCELERATION.\n"
- "Disabling all CPU architecture-specific (e.g. SSE2/SSE4/AVX) routines.\n\n");
+ "\nFound environment variable GMX_DISABLE_SIMD_KERNELS.\n"
+ "Disabling the usage of any SIMD-specific kernel routines (e.g. SSE2/SSE4.1/AVX).\n\n");
}
}
fr->AllvsAll_workgb = NULL;
/* All-vs-all kernels have not been implemented in 4.6, and
- * the SIMD group kernels are also buggy in this case. Non-accelerated
+ * the SIMD group kernels are also buggy in this case. Non-SIMD
* group kernels are OK. See Redmine #1249. */
if (fr->bAllvsAll)
{
fr->bAllvsAll = FALSE;
- fr->use_cpu_acceleration = FALSE;
+ fr->use_simd_kernels = FALSE;
if (fp != NULL)
{
fprintf(fp,
fr->bGrid = (ir->ns_type == ensGRID);
fr->ePBC = ir->ePBC;
+ if (fr->cutoff_scheme == ecutsGROUP)
+ {
+ const char *note = "NOTE: This file uses the deprecated 'group' cutoff_scheme. This will be\n"
+ "removed in a future release when 'verlet' supports all interaction forms.\n";
+
+ if (MASTER(cr))
+ {
+ fprintf(stderr, "\n%s\n", note);
+ }
+ if (fp != NULL)
+ {
+ fprintf(fp, "\n%s\n", note);
+ }
+ }
+
/* Determine if we will do PBC for distances in bonded interactions */
if (fr->ePBC == epbcNONE)
{
fr->rc_scaling = ir->refcoord_scaling;
copy_rvec(ir->posres_com, fr->posres_com);
copy_rvec(ir->posres_comB, fr->posres_comB);
- fr->rlist = cutoff_inf(ir->rlist);
- fr->rlistlong = cutoff_inf(ir->rlistlong);
- fr->eeltype = ir->coulombtype;
- fr->vdwtype = ir->vdwtype;
+ fr->rlist = cutoff_inf(ir->rlist);
+ fr->rlistlong = cutoff_inf(ir->rlistlong);
+ fr->eeltype = ir->coulombtype;
+ fr->vdwtype = ir->vdwtype;
+ fr->ljpme_combination_rule = ir->ljpme_combination_rule;
fr->coulomb_modifier = ir->coulomb_modifier;
fr->vdw_modifier = ir->vdw_modifier;
fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LENNARDJONES;
}
break;
+ case evdwPME:
+ fr->nbkernel_vdw_interaction = GMX_NBKERNEL_VDW_LJEWALD;
+ break;
case evdwSWITCH:
case evdwSHIFT:
fr->nbkernel_elec_modifier = fr->coulomb_modifier;
fr->nbkernel_vdw_modifier = fr->vdw_modifier;
+ fr->rvdw = cutoff_inf(ir->rvdw);
+ fr->rvdw_switch = ir->rvdw_switch;
+ fr->rcoulomb = cutoff_inf(ir->rcoulomb);
+ fr->rcoulomb_switch = ir->rcoulomb_switch;
+
fr->bTwinRange = fr->rlistlong > fr->rlist;
fr->bEwald = (EEL_PME(fr->eeltype) || fr->eeltype == eelEWALD);
if (ir->cutoff_scheme == ecutsGROUP)
{
- fr->bvdwtab = (fr->vdwtype != evdwCUT ||
- !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS));
+ fr->bvdwtab = ((fr->vdwtype != evdwCUT || !gmx_within_tol(fr->reppow, 12.0, 10*GMX_DOUBLE_EPS))
+ && !EVDW_PME(fr->vdwtype));
/* We have special kernels for standard Ewald and PME, but the pme-switch ones are tabulated above */
fr->bcoultab = !(fr->eeltype == eelCUT ||
fr->eeltype == eelEWALD ||
/* If the user absolutely wants different switch/shift settings for coul/vdw, it is likely
* going to be faster to tabulate the interaction than calling the generic kernel.
+ * However, if generic kernels have been requested we keep things analytically.
*/
- if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH && fr->nbkernel_vdw_modifier == eintmodPOTSWITCH)
+ if (fr->nbkernel_elec_modifier == eintmodPOTSWITCH &&
+ fr->nbkernel_vdw_modifier == eintmodPOTSWITCH &&
+ bGenericKernelOnly == FALSE)
{
if ((fr->rcoulomb_switch != fr->rvdw_switch) || (fr->rcoulomb != fr->rvdw))
{
fr->bcoultab = TRUE;
+ /* Once we tabulate electrostatics, we can use the switch function for LJ,
+ * which would otherwise need two tables.
+ */
}
}
else if ((fr->nbkernel_elec_modifier == eintmodPOTSHIFT && fr->nbkernel_vdw_modifier == eintmodPOTSHIFT) ||
fr->nbkernel_elec_modifier == eintmodEXACTCUTOFF &&
(fr->nbkernel_vdw_modifier == eintmodPOTSWITCH || fr->nbkernel_vdw_modifier == eintmodPOTSHIFT))))
{
- if (fr->rcoulomb != fr->rvdw)
+ if ((fr->rcoulomb != fr->rvdw) && (bGenericKernelOnly == FALSE))
{
fr->bcoultab = TRUE;
}
}
+ if (fr->nbkernel_elec_modifier == eintmodFORCESWITCH)
+ {
+ fr->bcoultab = TRUE;
+ }
+ if (fr->nbkernel_vdw_modifier == eintmodFORCESWITCH)
+ {
+ fr->bvdwtab = TRUE;
+ }
+
if (getenv("GMX_REQUIRE_TABLES"))
{
fr->bvdwtab = TRUE;
{
if (fp)
{
- fprintf(fp, "Will do PME sum in reciprocal space.\n");
+ fprintf(fp, "Will do PME sum in reciprocal space for electrostatic interactions.\n");
}
if (ir->coulombtype == eelP3M_AD)
{
please_cite(fp, "In-Chul99a");
}
}
- fr->ewaldcoeff = calc_ewaldcoeff(ir->rcoulomb, ir->ewald_rtol);
+ fr->ewaldcoeff_q = calc_ewaldcoeff_q(ir->rcoulomb, ir->ewald_rtol);
init_ewald_tab(&(fr->ewald_table), ir, fp);
if (fp)
{
fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for Ewald\n",
- 1/fr->ewaldcoeff);
+ 1/fr->ewaldcoeff_q);
+ }
+ }
+
+ if (EVDW_PME(ir->vdwtype))
+ {
+ if (fp)
+ {
+ fprintf(fp, "Will do PME sum in reciprocal space for LJ dispersion interactions.\n");
+ }
+ please_cite(fp, "Essmann95a");
+ fr->ewaldcoeff_lj = calc_ewaldcoeff_lj(ir->rvdw, ir->ewald_rtol_lj);
+ if (fp)
+ {
+ fprintf(fp, "Using a Gaussian width (1/beta) of %g nm for LJ Ewald\n",
+ 1/fr->ewaldcoeff_lj);
}
}
fr->epsilon_r = ir->epsilon_r;
fr->epsilon_rf = ir->epsilon_rf;
fr->fudgeQQ = mtop->ffparams.fudgeQQ;
- fr->rcoulomb_switch = ir->rcoulomb_switch;
- fr->rcoulomb = cutoff_inf(ir->rcoulomb);
/* Parameters for generalized RF */
fr->zsquare = 0.0;
{
init_generalized_rf(fp, mtop, ir, fr);
}
- else if (fr->eeltype == eelSHIFT)
- {
- for (m = 0; (m < DIM); m++)
- {
- box_size[m] = box[m][m];
- }
-
- if ((fr->eeltype == eelSHIFT && fr->rcoulomb > fr->rcoulomb_switch))
- {
- set_shift_consts(fr->rcoulomb_switch, fr->rcoulomb, box_size);
- }
- }
- fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) ||
+ fr->bF_NoVirSum = (EEL_FULL(fr->eeltype) || EVDW_PME(fr->vdwtype) ||
gmx_mtop_ftype_count(mtop, F_POSRES) > 0 ||
gmx_mtop_ftype_count(mtop, F_FBPOSRES) > 0 ||
IR_ELEC_FIELD(*ir) ||
{
fr->ntype = mtop->ffparams.atnr;
fr->nbfp = mk_nbfp(&mtop->ffparams, fr->bBHAM);
+ if (EVDW_PME(fr->vdwtype))
+ {
+ fr->ljpme_c6grid = make_ljpme_c6grid(&mtop->ffparams, fr);
+ }
}
/* Copy the energy group exclusions */
fr->egp_flags = ir->opts.egp_flags;
/* Van der Waals stuff */
- fr->rvdw = cutoff_inf(ir->rvdw);
- fr->rvdw_switch = ir->rvdw_switch;
if ((fr->vdwtype != evdwCUT) && (fr->vdwtype != evdwUSER) && !fr->bBHAM)
{
if (fr->rvdw_switch >= fr->rvdw)
}
}
+ if (fr->bBHAM && EVDW_PME(fr->vdwtype))
+ {
+ gmx_fatal(FARGS, "LJ PME not supported with Buckingham");
+ }
+
if (fr->bBHAM && (fr->vdwtype == evdwSHIFT || fr->vdwtype == evdwSWITCH))
{
gmx_fatal(FARGS, "Switch/shift interaction not supported with Buckingham");
}
+ if (fr->bBHAM && fr->cutoff_scheme == ecutsVERLET)
+ {
+ gmx_fatal(FARGS, "Verlet cutoff-scheme is not supported with Buckingham");
+ }
+
if (fp)
{
fprintf(fp, "Cut-off's: NS: %g Coulomb: %g %s: %g\n",
fr->gbtabr = 100;
fr->gbtab = make_gb_table(oenv, fr);
- init_gb(&fr->born, cr, fr, ir, mtop, ir->gb_algorithm);
+ init_gb(&fr->born, fr, ir, mtop, ir->gb_algorithm);
/* Copy local gb data (for dd, this is done in dd_partition_system) */
if (!DOMAINDECOMP(cr))
&fr->kappa, &fr->k_rf, &fr->c_rf);
}
+ /*This now calculates sum for q and c6*/
set_chargesum(fp, fr, mtop);
/* if we are using LR electrostatics, and they are tabulated,
* A little unnecessary to make both vdw and coul tables sometimes,
* but what the heck... */
- bTab = fr->bcoultab || fr->bvdwtab || fr->bEwald;
+ bMakeTables = fr->bcoultab || fr->bvdwtab || fr->bEwald ||
+ (ir->eDispCorr != edispcNO && ir_vdw_switched(ir));
- bSep14tab = ((!bTab || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
- fr->bBHAM || fr->bEwald) &&
- (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
- gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
- gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
+ bMakeSeparate14Table = ((!bMakeTables || fr->eeltype != eelCUT || fr->vdwtype != evdwCUT ||
+ fr->coulomb_modifier != eintmodNONE ||
+ fr->vdw_modifier != eintmodNONE ||
+ fr->bBHAM || fr->bEwald) &&
+ (gmx_mtop_ftype_count(mtop, F_LJ14) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_LJC14_Q) > 0 ||
+ gmx_mtop_ftype_count(mtop, F_LJC_PAIRS_NB) > 0));
negp_pp = ir->opts.ngener - ir->nwall;
negptable = 0;
- if (!bTab)
+ if (!bMakeTables)
{
- bNormalnblists = TRUE;
- fr->nnblists = 1;
+ bSomeNormalNbListsAreInUse = TRUE;
+ fr->nnblists = 1;
}
else
{
- bNormalnblists = (ir->eDispCorr != edispcNO);
+ bSomeNormalNbListsAreInUse = (ir->eDispCorr != edispcNO);
for (egi = 0; egi < negp_pp; egi++)
{
for (egj = egi; egj < negp_pp; egj++)
}
else
{
- bNormalnblists = TRUE;
+ bSomeNormalNbListsAreInUse = TRUE;
}
}
}
}
- if (bNormalnblists)
+ if (bSomeNormalNbListsAreInUse)
{
fr->nnblists = negptable + 1;
}
*/
rtab = ir->rlistlong + ir->tabext;
- if (bTab)
+ if (bMakeTables)
{
/* make tables for ordinary interactions */
- if (bNormalnblists)
+ if (bSomeNormalNbListsAreInUse)
{
make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
if (ir->adress)
{
make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[fr->nnblists/2]);
}
- if (!bSep14tab)
+ if (!bMakeSeparate14Table)
{
fr->tab14 = fr->nblists[0].table_elec_vdw;
}
}
}
}
- if (bSep14tab)
+ else if ((fr->eDispCorr != edispcNO) &&
+ ((fr->vdw_modifier == eintmodPOTSWITCH) ||
+ (fr->vdw_modifier == eintmodFORCESWITCH) ||
+ (fr->vdw_modifier == eintmodPOTSHIFT)))
+ {
+ /* Tables might not be used for the potential modifier interactions per se, but
+ * we still need them to evaluate switch/shift dispersion corrections in this case.
+ */
+ make_nbf_tables(fp, oenv, fr, rtab, cr, tabfn, NULL, NULL, &fr->nblists[0]);
+ }
+
+ if (bMakeSeparate14Table)
{
/* generate extra tables with plain Coulomb for 1-4 interactions only */
fr->tab14 = make_tables(fp, oenv, fr, MASTER(cr), tabpfn, rtab,
/* Set all the static charge group info */
fr->cginfo_mb = init_cginfo_mb(fp, mtop, fr, bNoSolvOpt,
+ &bFEP_NonBonded,
&fr->bExcl_IntraCGAll_InterCGNone);
if (DOMAINDECOMP(cr))
{
if (!DOMAINDECOMP(cr))
{
- /* When using particle decomposition, the effect of the second argument,
- * which sets fr->hcg, is corrected later in do_md and init_em.
- */
forcerec_set_ranges(fr, ncg_mtop(mtop), ncg_mtop(mtop),
mtop->natoms, mtop->natoms, mtop->natoms);
}
gmx_fatal(FARGS, "With Verlet lists rcoulomb and rvdw should be identical");
}
- init_nb_verlet(fp, &fr->nbv, ir, fr, cr, nbpu_opt);
+ init_nb_verlet(fp, &fr->nbv, bFEP_NonBonded, ir, fr, cr, nbpu_opt);
}
/* fr->ic is used both by verlet and group kernels (to some extent) now */
- init_interaction_const(fp, &fr->ic, fr, rtab);
+ init_interaction_const(fp, cr, &fr->ic, fr, rtab);
+
if (ir->eDispCorr != edispcNO)
{
calc_enervirdiff(fp, ir->eDispCorr, fr);
fflush(fp);
}
-void forcerec_set_excl_load(t_forcerec *fr,
- const gmx_localtop_t *top, const t_commrec *cr)
+void forcerec_set_excl_load(t_forcerec *fr,
+ const gmx_localtop_t *top)
{
const int *ind, *a;
int t, i, j, ntot, n, ntarget;
- if (cr != NULL && PARTDECOMP(cr))
- {
- /* No OpenMP with particle decomposition */
- pd_at_range(cr,
- &fr->excl_load[0],
- &fr->excl_load[1]);
-
- return;
- }
-
ind = top->excls.index;
a = top->excls.a;
fr->excl_load[t] = i;
}
}
+
+
+/* Frees GPU memory and destroys the CUDA context.
+ *
+ * Note that this function needs to be called even if GPUs are not used
+ * in this run because the PME ranks have no knowledge of whether GPUs
+ * are used or not, but all ranks need to enter the barrier below.
+ */
+void free_gpu_resources(const t_forcerec *fr,
+ const t_commrec *cr)
+{
+ gmx_bool bIsPPrankUsingGPU;
+ char gpu_err_str[STRLEN];
+
+ /* Only a particle-particle (PP) rank that actually ran non-bonded
+ * work on a GPU has anything to deallocate; PME-only ranks (and runs
+ * without GPU-accelerated non-bondeds) fall straight through. */
+ bIsPPrankUsingGPU = (cr->duty & DUTY_PP) && fr && fr->nbv && fr->nbv->bUseGPU;
+
+ if (bIsPPrankUsingGPU)
+ {
+ /* free nbnxn data in GPU memory */
+ nbnxn_cuda_free(fr->nbv->cu_nbv);
+
+ /* With tMPI we need to wait for all ranks to finish deallocation before
+ * destroying the context in free_gpu() as some ranks may be sharing a
+ * GPU and context.
+ * Note: only PP ranks need to free GPU resources, so it is safe to
+ * not call the barrier on PME ranks.
+ */
+#ifdef GMX_THREAD_MPI
+ if (PAR(cr))
+ {
+ gmx_barrier(cr);
+ }
+#endif /* GMX_THREAD_MPI */
+
+ /* uninitialize GPU (by destroying the context); on failure we only
+ * warn rather than abort, since the simulation output is already safe
+ * at this point and we are merely cleaning up. */
+ if (!free_gpu(gpu_err_str))
+ {
+ gmx_warning("On rank %d failed to free GPU #%d: %s",
+ cr->nodeid, get_current_gpu_device_id(), gpu_err_str);
+ }
+ }
+}