Minor performance improvements

[alexxy/gromacs.git] / src / gromacs / gmxlib / nonbonded / nb_free_energy.c
diff --git a/src/gromacs/gmxlib/nonbonded/nb_free_energy.c b/src/gromacs/gmxlib/nonbonded/nb_free_energy.c

index 5414c668c6936a22a2238e2a079fba88c91e34ae..6b5d6d89dcae8b49ee779c473893c399e7fd8bce 100644 (file)
--- a/src/gromacs/gmxlib/nonbonded/nb_free_energy.c
+++ b/src/gromacs/gmxlib/nonbonded/nb_free_energy.c
@@ -34,20 +34,18 @@
   * To help us fund GROMACS development, we humbly ask that you cite
   * the research papers on the package. Check out http://www.gromacs.org.
   */
-#ifdef HAVE_CONFIG_H
-#include <config.h>
-#endif
+#include "gmxpre.h"
+
+#include "nb_free_energy.h"
  
  #include <math.h>
  
+#include "gromacs/gmxlib/nonbonded/nb_kernel.h"
+#include "gromacs/legacyheaders/macros.h"
+#include "gromacs/legacyheaders/nonbonded.h"
+#include "gromacs/legacyheaders/nrnb.h"
+#include "gromacs/legacyheaders/typedefs.h"
  #include "gromacs/math/vec.h"
-#include "typedefs.h"
-#include "nonbonded.h"
-#include "nb_kernel.h"
-#include "nrnb.h"
-#include "macros.h"
-#include "nb_free_energy.h"
-
  #include "gromacs/utility/fatalerror.h"
  
  void
@@ -65,8 +63,9 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
  #define  NSTATES  2
      int           i, j, n, ii, is3, ii3, k, nj0, nj1, jnr, j3, ggid;
      real          shX, shY, shZ;
-    real          Fscal, FscalC[NSTATES], FscalV[NSTATES], tx, ty, tz;
-    real          Vcoul[NSTATES], Vvdw[NSTATES];
+    real          tx, ty, tz, Fscal;
+    double        FscalC[NSTATES], FscalV[NSTATES];  /* Needs double for sc_power==48 */
+    double        Vcoul[NSTATES], Vvdw[NSTATES];     /* Needs double for sc_power==48 */
      real          rinv6, r, rt, rtC, rtV;
      real          iqA, iqB;
      real          qq[NSTATES], vctot, krsq;
@@ -74,12 +73,12 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
      real          Vvdw6, Vvdw12, vvtot;
      real          ix, iy, iz, fix, fiy, fiz;
      real          dx, dy, dz, rsq, rinv;
-    real          c6[NSTATES], c12[NSTATES], c6grid[NSTATES];
+    real          c6[NSTATES], c12[NSTATES], c6grid;
      real          LFC[NSTATES], LFV[NSTATES], DLF[NSTATES];
      double        dvdl_coul, dvdl_vdw;
      real          lfac_coul[NSTATES], dlfac_coul[NSTATES], lfac_vdw[NSTATES], dlfac_vdw[NSTATES];
      real          sigma6[NSTATES], alpha_vdw_eff, alpha_coul_eff, sigma2_def, sigma2_min;
-    real          rp, rpm2, rC, rV, rinvC, rpinvC, rinvV, rpinvV;
+    double        rp, rpm2, rC, rV, rinvC, rpinvC, rinvV, rpinvV; /* Needs double for sc_power==48 */
      real          sigma2[NSTATES], sigma_pow[NSTATES], sigma_powm2[NSTATES], rs, rs2;
      int           do_tab, tab_elemsize;
      int           n0, n1C, n1V, nnn;
@@ -106,21 +105,43 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
      const real *  chargeB;
      real          sigma6_min, sigma6_def, lam_power, sc_power, sc_r_power;
      real          alpha_coul, alpha_vdw, lambda_coul, lambda_vdw, ewc_lj;
+    real          ewcljrsq, ewclj, ewclj2, exponent, poly, vvdw_disp, vvdw_rep, sh_lj_ewald;
+    real          ewclj6;
      const real *  nbfp, *nbfp_grid;
      real *        dvdl;
      real *        Vv;
      real *        Vc;
      gmx_bool      bDoForces, bDoShiftForces, bDoPotential;
-    real          rcoulomb, sh_ewald;
-    real          rvdw, sh_invrc6;
-    gmx_bool      bExactElecCutoff, bExactVdwCutoff, bExactCutoffAll, bEwald;
+    real          rcoulomb, rvdw, sh_invrc6;
+    gmx_bool      bExactElecCutoff, bExactVdwCutoff, bExactCutoffAll;
+    gmx_bool      bEwald, bEwaldLJ;
      real          rcutoff_max2;
-    real          rcutoff, rcutoff2, rswitch, d, d2, swV3, swV4, swV5, swF2, swF3, swF4, sw, dsw, rinvcorr;
-    const real *  tab_ewald_F;
-    const real *  tab_ewald_V;
      const real *  tab_ewald_F_lj;
      const real *  tab_ewald_V_lj;
-    real          tab_ewald_scale, tab_ewald_halfsp;
+    real          d, d2, sw, dsw, rinvcorr;
+    real          elec_swV3, elec_swV4, elec_swV5, elec_swF2, elec_swF3, elec_swF4;
+    real          vdw_swV3, vdw_swV4, vdw_swV5, vdw_swF2, vdw_swF3, vdw_swF4;
+    gmx_bool      bConvertEwaldToCoulomb, bConvertLJEwaldToLJ6;
+    gmx_bool      bComputeVdwInteraction, bComputeElecInteraction;
+    const real *  ewtab;
+    int           ewitab;
+    real          ewrt, eweps, ewtabscale, ewtabhalfspace, sh_ewald;
+
+    const real    onetwelfth  = 1.0/12.0;
+    const real    onesixth    = 1.0/6.0;
+    const real    zero        = 0.0;
+    const real    half        = 0.5;
+    const real    one         = 1.0;
+    const real    two         = 2.0;
+    const real    six         = 6.0;
+    const real    fourtyeight = 48.0;
+
+    sh_ewald            = fr->ic->sh_ewald;
+    ewtab               = fr->ic->tabq_coul_FDV0;
+    ewtabscale          = fr->ic->tabq_scale;
+    ewtabhalfspace      = half/ewtabscale;
+    tab_ewald_F_lj      = fr->ic->tabq_vdw_F;
+    tab_ewald_V_lj      = fr->ic->tabq_vdw_V;
  
      x                   = xx[0];
      f                   = ff[0];
@@ -164,41 +185,43 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
      bDoPotential        = kernel_data->flags & GMX_NONBONDED_DO_POTENTIAL;
  
      rcoulomb            = fr->rcoulomb;
-    sh_ewald            = fr->ic->sh_ewald;
      rvdw                = fr->rvdw;
      sh_invrc6           = fr->ic->sh_invrc6;
+    sh_lj_ewald         = fr->ic->sh_lj_ewald;
+    ewclj               = fr->ewaldcoeff_lj;
+    ewclj2              = ewclj*ewclj;
+    ewclj6              = ewclj2*ewclj2*ewclj2;
  
-    /* Ewald (PME) reciprocal force and energy quadratic spline tables */
-    tab_ewald_F         = fr->ic->tabq_coul_F;
-    tab_ewald_V         = fr->ic->tabq_coul_V;
-    tab_ewald_scale     = fr->ic->tabq_scale;
-    tab_ewald_F_lj      = fr->ic->tabq_vdw_F;
-    tab_ewald_V_lj      = fr->ic->tabq_vdw_V;
-    tab_ewald_halfsp    = 0.5/tab_ewald_scale;
+    if (fr->coulomb_modifier == eintmodPOTSWITCH)
+    {
+        d               = fr->rcoulomb-fr->rcoulomb_switch;
+        elec_swV3       = -10.0/(d*d*d);
+        elec_swV4       =  15.0/(d*d*d*d);
+        elec_swV5       =  -6.0/(d*d*d*d*d);
+        elec_swF2       = -30.0/(d*d*d);
+        elec_swF3       =  60.0/(d*d*d*d);
+        elec_swF4       = -30.0/(d*d*d*d*d);
+    }
+    else
+    {
+        /* Avoid warnings from stupid compilers (looking at you, Clang!) */
+        elec_swV3 = elec_swV4 = elec_swV5 = elec_swF2 = elec_swF3 = elec_swF4 = 0.0;
+    }
  
-    if (fr->coulomb_modifier == eintmodPOTSWITCH || fr->vdw_modifier == eintmodPOTSWITCH)
+    if (fr->vdw_modifier == eintmodPOTSWITCH)
      {
-        rcutoff         = (fr->coulomb_modifier == eintmodPOTSWITCH) ? fr->rcoulomb : fr->rvdw;
-        rcutoff2        = rcutoff*rcutoff;
-        rswitch         = (fr->coulomb_modifier == eintmodPOTSWITCH) ? fr->rcoulomb_switch : fr->rvdw_switch;
-        d               = rcutoff-rswitch;
-        swV3            = -10.0/(d*d*d);
-        swV4            =  15.0/(d*d*d*d);
-        swV5            =  -6.0/(d*d*d*d*d);
-        swF2            = -30.0/(d*d*d);
-        swF3            =  60.0/(d*d*d*d);
-        swF4            = -30.0/(d*d*d*d*d);
+        d               = fr->rvdw-fr->rvdw_switch;
+        vdw_swV3        = -10.0/(d*d*d);
+        vdw_swV4        =  15.0/(d*d*d*d);
+        vdw_swV5        =  -6.0/(d*d*d*d*d);
+        vdw_swF2        = -30.0/(d*d*d);
+        vdw_swF3        =  60.0/(d*d*d*d);
+        vdw_swF4        = -30.0/(d*d*d*d*d);
      }
      else
      {
-        /* Stupid compilers dont realize these variables will not be used */
-        rswitch         = 0.0;
-        swV3            = 0.0;
-        swV4            = 0.0;
-        swV5            = 0.0;
-        swF2            = 0.0;
-        swF3            = 0.0;
-        swF4            = 0.0;
+        /* Avoid warnings from stupid compilers (looking at you, Clang!) */
+        vdw_swV3 = vdw_swV4 = vdw_swV5 = vdw_swF2 = vdw_swF3 = vdw_swF4 = 0.0;
      }
  
      if (fr->cutoff_scheme == ecutsVERLET)
@@ -242,6 +265,34 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
      rcutoff_max2    = rcutoff_max2*rcutoff_max2;
  
      bEwald          = (icoul == GMX_NBKERNEL_ELEC_EWALD);
+    bEwaldLJ        = (ivdw == GMX_NBKERNEL_VDW_LJEWALD);
+
+    /* For Ewald/PME interactions we cannot easily apply the soft-core component to
+     * reciprocal space. When we use vanilla (not switch/shift) Ewald interactions, we
+     * can apply the small trick of subtracting the _reciprocal_ space contribution
+     * in this kernel, and instead apply the free energy interaction to the 1/r
+     * (standard coulomb) interaction.
+     *
+     * However, we cannot use this approach for switch-modified since we would then
+     * effectively end up evaluating a significantly different interaction here compared to the
+     * normal (non-free-energy) kernels, either by applying a cutoff at a different
+     * position than what the user requested, or by switching different
+     * things (1/r rather than short-range Ewald). For these settings, we just
+     * use the traditional short-range Ewald interaction in that case.
+     */
+    bConvertEwaldToCoulomb = (bEwald && (fr->coulomb_modifier != eintmodPOTSWITCH));
+    /* For now the below will always be true (since LJ-PME only works with Shift in Gromacs-5.0),
+     * but writing it this way means we stay in sync with coulomb, and it avoids future bugs.
+     */
+    bConvertLJEwaldToLJ6   = (bEwaldLJ && (fr->vdw_modifier   != eintmodPOTSWITCH));
+
+    /* We currently don't implement exclusion correction, needed with the Verlet cut-off scheme, without conversion */
+    if (fr->cutoff_scheme == ecutsVERLET &&
+        ((bEwald   && !bConvertEwaldToCoulomb) ||
+         (bEwaldLJ && !bConvertLJEwaldToLJ6)))
+    {
+        gmx_incons("Unimplemented non-bonded setup");
+    }
  
      /* fix compiler warnings */
      nj1   = 0;
@@ -253,8 +304,8 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
      dvdl_vdw   = 0;
  
      /* Lambda factor for state A, 1-lambda*/
-    LFC[STATE_A] = 1.0 - lambda_coul;
-    LFV[STATE_A] = 1.0 - lambda_vdw;
+    LFC[STATE_A] = one - lambda_coul;
+    LFV[STATE_A] = one - lambda_vdw;
  
      /* Lambda factor for state B, lambda*/
      LFC[STATE_B] = lambda_coul;
@@ -349,12 +400,12 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                  r            = 0;
              }
  
-            if (sc_r_power == 6.0)
+            if (sc_r_power == six)
              {
                  rpm2             = rsq*rsq;  /* r4 */
                  rp               = rpm2*rsq; /* r6 */
              }
-            else if (sc_r_power == 48.0)
+            else if (sc_r_power == fourtyeight)
              {
                  rp               = rsq*rsq*rsq; /* r6 */
                  rp               = rp*rp;       /* r12 */
@@ -376,12 +427,6 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
              tj[STATE_A]      = ntiA+2*typeA[jnr];
              tj[STATE_B]      = ntiB+2*typeB[jnr];
  
-            if (ivdw == GMX_NBKERNEL_VDW_LJEWALD)
-            {
-                c6grid[STATE_A] = nbfp_grid[tj[STATE_A]];
-                c6grid[STATE_B] = nbfp_grid[tj[STATE_B]];
-            }
-
              if (nlist->excl_fep == NULL || nlist->excl_fep[k])
              {
                  c6[STATE_A]      = nbfp[tj[STATE_A]];
@@ -393,7 +438,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                      if ((c6[i] > 0) && (c12[i] > 0))
                      {
                          /* c12 is stored scaled with 12.0 and c6 is scaled with 6.0 - correct for this */
-                        sigma6[i]       = 0.5*c12[i]/c6[i];
+                        sigma6[i]       = half*c12[i]/c6[i];
                          sigma2[i]       = pow(sigma6[i], 1.0/3.0);
                          /* should be able to get rid of this ^^^ internal pow call eventually.  Will require agreement on
                             what data to store externally.  Can't be fixed without larger scale changes, so not 4.6 */
@@ -408,12 +453,12 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                          sigma6[i]       = sigma6_def;
                          sigma2[i]       = sigma2_def;
                      }
-                    if (sc_r_power == 6.0)
+                    if (sc_r_power == six)
                      {
                          sigma_pow[i]    = sigma6[i];
                          sigma_powm2[i]  = sigma6[i]/sigma2[i];
                      }
-                    else if (sc_r_power == 48.0)
+                    else if (sc_r_power == fourtyeight)
                      {
                          sigma_pow[i]    = sigma6[i]*sigma6[i];       /* sigma^12 */
                          sigma_pow[i]    = sigma_pow[i]*sigma_pow[i]; /* sigma^24 */
@@ -450,13 +495,13 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                      if ( (qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0) )
                      {
                          /* this section has to be inside the loop because of the dependence on sigma_pow */
-                        rpinvC         = 1.0/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp);
-                        rinvC          = pow(rpinvC, 1.0/sc_r_power);
-                        rC             = 1.0/rinvC;
+                        rpinvC         = one/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp);
+                        rinvC          = pow(rpinvC, one/sc_r_power);
+                        rC             = one/rinvC;
  
-                        rpinvV         = 1.0/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp);
-                        rinvV          = pow(rpinvV, 1.0/sc_r_power);
-                        rV             = 1.0/rinvV;
+                        rpinvV         = one/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp);
+                        rinvV          = pow(rpinvV, one/sc_r_power);
+                        rV             = one/rinvV;
  
                          if (do_tab)
                          {
@@ -473,14 +518,15 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                              n1V        = tab_elemsize*n0;
                          }
  
-                        /* With Ewald and soft-core we should put the cut-off on r,
-                         * not on the soft-cored rC, as the real-space and
-                         * reciprocal space contributions should (almost) cancel.
+                        /* Only process the coulomb interactions if we have charges,
+                         * and if we either include all entries in the list (no cutoff
+                         * used in the kernel), or if we are within the cutoff.
                           */
-                        if (qq[i] != 0 &&
-                            !(bExactElecCutoff &&
-                              ((!bEwald && rC >= rcoulomb) ||
-                               (bEwald && r >= rcoulomb))))
+                        bComputeElecInteraction = !bExactElecCutoff ||
+                            ( bConvertEwaldToCoulomb && r < rcoulomb) ||
+                            (!bConvertEwaldToCoulomb && rC < rcoulomb);
+
+                        if ( (qq[i] != 0) && bComputeElecInteraction)
                          {
                              switch (icoul)
                              {
@@ -488,18 +534,16 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                                      /* simple cutoff */
                                      Vcoul[i]   = qq[i]*rinvC;
                                      FscalC[i]  = Vcoul[i];
-                                    break;
-
-                                case GMX_NBKERNEL_ELEC_EWALD:
-                                    /* Ewald FEP is done only on the 1/r part */
-                                    Vcoul[i]   = qq[i]*(rinvC - sh_ewald);
-                                    FscalC[i]  = Vcoul[i];
+                                    /* The shift for the Coulomb potential is stored in
+                                     * the RF parameter c_rf, which is 0 without shift.
+                                     */
+                                    Vcoul[i]  -= qq[i]*fr->ic->c_rf;
                                      break;
  
                                  case GMX_NBKERNEL_ELEC_REACTIONFIELD:
                                      /* reaction-field */
                                      Vcoul[i]   = qq[i]*(rinvC + krf*rC*rC-crf);
-                                    FscalC[i]  = qq[i]*(rinvC - 2.0*krf*rC*rC);
+                                    FscalC[i]  = qq[i]*(rinvC - two*krf*rC*rC);
                                      break;
  
                                  case GMX_NBKERNEL_ELEC_CUBICSPLINETABLE:
@@ -511,7 +555,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                                      Heps2      = eps2C*VFtab[nnn+3];
                                      Fp         = F+Geps+Heps2;
                                      VV         = Y+epsC*Fp;
-                                    FF         = Fp+Geps+2.0*Heps2;
+                                    FF         = Fp+Geps+two*Heps2;
                                      Vcoul[i]   = qq[i]*VV;
                                      FscalC[i]  = -qq[i]*tabscale*FF*rC;
                                      break;
@@ -520,9 +564,29 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                                      gmx_fatal(FARGS, "Free energy and GB not implemented.\n");
                                      break;
  
+                                case GMX_NBKERNEL_ELEC_EWALD:
+                                    if (bConvertEwaldToCoulomb)
+                                    {
+                                        /* Ewald FEP is done only on the 1/r part */
+                                        Vcoul[i]   = qq[i]*(rinvC-sh_ewald);
+                                        FscalC[i]  = qq[i]*rinvC;
+                                    }
+                                    else
+                                    {
+                                        ewrt      = rC*ewtabscale;
+                                        ewitab    = (int) ewrt;
+                                        eweps     = ewrt-ewitab;
+                                        ewitab    = 4*ewitab;
+                                        FscalC[i] = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+                                        rinvcorr  = rinvC-sh_ewald;
+                                        Vcoul[i]  = qq[i]*(rinvcorr-(ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+FscalC[i])));
+                                        FscalC[i] = qq[i]*(rinvC-rC*FscalC[i]);
+                                    }
+                                    break;
+
                                  case GMX_NBKERNEL_ELEC_NONE:
-                                    FscalC[i]  = 0.0;
-                                    Vcoul[i]   = 0.0;
+                                    FscalC[i]  = zero;
+                                    Vcoul[i]   = zero;
                                      break;
  
                                  default:
@@ -532,46 +596,48 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
  
                              if (fr->coulomb_modifier == eintmodPOTSWITCH)
                              {
-                                d                = rC-rswitch;
-                                d                = (d > 0.0) ? d : 0.0;
+                                d                = rC-fr->rcoulomb_switch;
+                                d                = (d > zero) ? d : zero;
                                  d2               = d*d;
-                                sw               = 1.0+d2*d*(swV3+d*(swV4+d*swV5));
-                                dsw              = d2*(swF2+d*(swF3+d*swF4));
+                                sw               = one+d2*d*(elec_swV3+d*(elec_swV4+d*elec_swV5));
+                                dsw              = d2*(elec_swF2+d*(elec_swF3+d*elec_swF4));
+
+                                FscalC[i]        = FscalC[i]*sw - rC*Vcoul[i]*dsw;
+                                Vcoul[i]        *= sw;
  
-                                Vcoul[i]  *= sw;
-                                FscalC[i]  = FscalC[i]*sw + Vcoul[i]*dsw;
+                                FscalC[i]        = (rC < rcoulomb) ? FscalC[i] : zero;
+                                Vcoul[i]         = (rC < rcoulomb) ? Vcoul[i] : zero;
                              }
                          }
  
-                        if ((c6[i] != 0 || c12[i] != 0) &&
-                            !(bExactVdwCutoff &&
-                              ((ivdw != GMX_NBKERNEL_VDW_LJEWALD && rV >= rvdw) ||
-                               (ivdw == GMX_NBKERNEL_VDW_LJEWALD && r >= rvdw))))
+                        /* Only process the VDW interactions if we have
+                         * some non-zero parameters, and if we either
+                         * include all entries in the list (no cutoff used
+                         * in the kernel), or if we are within the cutoff.
+                         */
+                        bComputeVdwInteraction = !bExactVdwCutoff ||
+                            ( bConvertLJEwaldToLJ6 && r < rvdw) ||
+                            (!bConvertLJEwaldToLJ6 && rV < rvdw);
+                        if ((c6[i] != 0 || c12[i] != 0) && bComputeVdwInteraction)
                          {
                              switch (ivdw)
                              {
                                  case GMX_NBKERNEL_VDW_LENNARDJONES:
-                                case GMX_NBKERNEL_VDW_LJEWALD:
                                      /* cutoff LJ */
-                                    if (sc_r_power == 6.0)
+                                    if (sc_r_power == six)
                                      {
                                          rinv6            = rpinvV;
                                      }
                                      else
                                      {
-                                        rinv6            = pow(rinvV, 6.0);
+                                        rinv6            = rinvV*rinvV;
+                                        rinv6            = rinv6*rinv6*rinv6;
                                      }
                                      Vvdw6            = c6[i]*rinv6;
                                      Vvdw12           = c12[i]*rinv6*rinv6;
-                                    if (fr->vdw_modifier == eintmodPOTSHIFT)
-                                    {
-                                        Vvdw[i]          = ( (Vvdw12-c12[i]*sh_invrc6*sh_invrc6)*(1.0/12.0)
-                                                             -(Vvdw6-c6[i]*sh_invrc6)*(1.0/6.0));
-                                    }
-                                    else
-                                    {
-                                        Vvdw[i]          = Vvdw12*(1.0/12.0) - Vvdw6*(1.0/6.0);
-                                    }
+
+                                    Vvdw[i]          = ( (Vvdw12 - c12[i]*sh_invrc6*sh_invrc6)*onetwelfth
+                                                         - (Vvdw6 - c6[i]*sh_invrc6)*onesixth);
                                      FscalV[i]        = Vvdw12 - Vvdw6;
                                      break;
  
@@ -589,7 +655,7 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                                      Heps2      = eps2V*VFtab[nnn+3];
                                      Fp         = F+Geps+Heps2;
                                      VV         = Y+epsV*Fp;
-                                    FF         = Fp+Geps+2.0*Heps2;
+                                    FF         = Fp+Geps+two*Heps2;
                                      Vvdw[i]   += c6[i]*VV;
                                      FscalV[i] -= c6[i]*tabscale*FF*rV;
  
@@ -600,14 +666,49 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                                      Heps2      = eps2V*VFtab[nnn+7];
                                      Fp         = F+Geps+Heps2;
                                      VV         = Y+epsV*Fp;
-                                    FF         = Fp+Geps+2.0*Heps2;
+                                    FF         = Fp+Geps+two*Heps2;
                                      Vvdw[i]   += c12[i]*VV;
                                      FscalV[i] -= c12[i]*tabscale*FF*rV;
                                      break;
  
+                                case GMX_NBKERNEL_VDW_LJEWALD:
+                                    if (sc_r_power == six)
+                                    {
+                                        rinv6            = rpinvV;
+                                    }
+                                    else
+                                    {
+                                        rinv6            = rinvV*rinvV;
+                                        rinv6            = rinv6*rinv6*rinv6;
+                                    }
+                                    c6grid           = nbfp_grid[tj[i]];
+
+                                    if (bConvertLJEwaldToLJ6)
+                                    {
+                                        /* cutoff LJ */
+                                        Vvdw6            = c6[i]*rinv6;
+                                        Vvdw12           = c12[i]*rinv6*rinv6;
+
+                                        Vvdw[i]          = ( (Vvdw12 - c12[i]*sh_invrc6*sh_invrc6)*onetwelfth
+                                                             - (Vvdw6 - c6[i]*sh_invrc6 - c6grid*sh_lj_ewald)*onesixth);
+                                        FscalV[i]        = Vvdw12 - Vvdw6;
+                                    }
+                                    else
+                                    {
+                                        /* Normal LJ-PME */
+                                        ewcljrsq         = ewclj2*rV*rV;
+                                        exponent         = exp(-ewcljrsq);
+                                        poly             = exponent*(one + ewcljrsq + ewcljrsq*ewcljrsq*half);
+                                        vvdw_disp        = (c6[i]-c6grid*(one-poly))*rinv6;
+                                        vvdw_rep         = c12[i]*rinv6*rinv6;
+                                        FscalV[i]        = vvdw_rep - vvdw_disp - c6grid*onesixth*exponent*ewclj6;
+                                        Vvdw[i]          = (vvdw_rep - c12[i]*sh_invrc6*sh_invrc6)*onetwelfth - (vvdw_disp - c6[i]*sh_invrc6 - c6grid*sh_lj_ewald)/six;
+                                    }
+                                    break;
+
                                  case GMX_NBKERNEL_VDW_NONE:
-                                    Vvdw[i]    = 0.0;
-                                    FscalV[i]  = 0.0;
+                                    Vvdw[i]    = zero;
+                                    FscalV[i]  = zero;
                                      break;
  
                                  default:
@@ -617,17 +718,17 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
  
                              if (fr->vdw_modifier == eintmodPOTSWITCH)
                              {
-                                d          = rV-rswitch;
-                                d          = (d > 0.0) ? d : 0.0;
-                                d2         = d*d;
-                                sw         = 1.0+d2*d*(swV3+d*(swV4+d*swV5));
-                                dsw        = d2*(swF2+d*(swF3+d*swF4));
+                                d                = rV-fr->rvdw_switch;
+                                d                = (d > zero) ? d : zero;
+                                d2               = d*d;
+                                sw               = one+d2*d*(vdw_swV3+d*(vdw_swV4+d*vdw_swV5));
+                                dsw              = d2*(vdw_swF2+d*(vdw_swF3+d*vdw_swF4));
  
-                                Vvdw[i]   *= sw;
-                                FscalV[i]  = FscalV[i]*sw + Vvdw[i]*dsw;
+                                FscalV[i]        = FscalV[i]*sw - rV*Vvdw[i]*dsw;
+                                Vvdw[i]         *= sw;
  
-                                FscalV[i]  = (rV < rvdw) ? FscalV[i] : 0.0;
-                                Vvdw[i]    = (rV < rvdw) ? Vvdw[i] : 0.0;
+                                FscalV[i]  = (rV < rvdw) ? FscalV[i] : zero;
+                                Vvdw[i]    = (rV < rvdw) ? Vvdw[i] : zero;
                              }
                          }
  
@@ -662,11 +763,11 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                   * As there is no singularity, there is no need for soft-core.
                   */
                  VV = krf*rsq - crf;
-                FF = -2.0*krf;
+                FF = -two*krf;
  
                  if (ii == jnr)
                  {
-                    VV *= 0.5;
+                    VV *= half;
                  }
  
                  for (i = 0; i < NSTATES; i++)
@@ -677,56 +778,92 @@ gmx_nb_free_energy_kernel(const t_nblist * gmx_restrict    nlist,
                  }
              }
  
-            if (icoul == GMX_NBKERNEL_ELEC_EWALD &&
-                !(bExactElecCutoff && r >= rcoulomb))
+            if (bConvertEwaldToCoulomb && ( !bExactElecCutoff || r < rcoulomb ) )
              {
-                /* Because we compute the soft-core normally,
-                 * we have to remove the Ewald short range portion.
-                 * Done outside of the states loop because this part
-                 * doesn't depend on the scaled R.
+                /* See comment in the preamble. When using Ewald interactions
+                 * (unless we use a switch modifier) we subtract the reciprocal-space
+                 * Ewald component here which made it possible to apply the free
+                 * energy interaction to 1/r (vanilla coulomb short-range part)
+                 * above. This gets us closer to the ideal case of applying
+                 * the softcore to the entire electrostatic interaction,
+                 * including the reciprocal-space component.
+                 */
+                real v_lr, f_lr;
+
+                ewrt      = r*ewtabscale;
+                ewitab    = (int) ewrt;
+                eweps     = ewrt-ewitab;
+                ewitab    = 4*ewitab;
+                f_lr      = ewtab[ewitab]+eweps*ewtab[ewitab+1];
+                v_lr      = (ewtab[ewitab+2]-ewtabhalfspace*eweps*(ewtab[ewitab]+f_lr));
+                f_lr     *= rinv;
+
+                /* Note that any possible Ewald shift has already been applied in
+                 * the normal interaction part above.
                   */
-                real rs, frac, f_lr;
-                int  ri;
-
-                rs     = rsq*rinv*tab_ewald_scale;
-                ri     = (int)rs;
-                frac   = rs - ri;
-                f_lr   = (1 - frac)*tab_ewald_F[ri] + frac*tab_ewald_F[ri+1];
-                FF     = f_lr*rinv;
-                VV     = tab_ewald_V[ri] - tab_ewald_halfsp*frac*(tab_ewald_F[ri] + f_lr);
  
                  if (ii == jnr)
                  {
-                    VV   *= 0.5;
+                    /* If we get here, the i particle (ii) has itself (jnr)
+                     * in its neighborlist. This can only happen with the Verlet
+                     * scheme, and corresponds to a self-interaction that will
+                     * occur twice. Scale it down by 50% to only include it once.
+                     */
+                    v_lr *= half;
                  }
  
                  for (i = 0; i < NSTATES; i++)
                  {
-                    vctot      -= LFC[i]*qq[i]*VV;
-                    Fscal      -= LFC[i]*qq[i]*FF;
-                    dvdl_coul  -= (DLF[i]*qq[i])*VV;
+                    vctot      -= LFC[i]*qq[i]*v_lr;
+                    Fscal      -= LFC[i]*qq[i]*f_lr;
+                    dvdl_coul  -= (DLF[i]*qq[i])*v_lr;
                  }
              }
  
-            if (ivdw == GMX_NBKERNEL_VDW_LJEWALD &&
-                !(bExactVdwCutoff && r >= rvdw))
+            if (bConvertLJEwaldToLJ6 && (!bExactVdwCutoff || r < rvdw))
              {
+                /* See comment in the preamble. When using LJ-Ewald interactions
+                 * (unless we use a switch modifier) we subtract the reciprocal-space
+                 * Ewald component here which made it possible to apply the free
+                 * energy interaction to r^-6 (vanilla LJ6 short-range part)
+                 * above. This gets us closer to the ideal case of applying
+                 * the softcore to the entire VdW interaction,
+                 * including the reciprocal-space component.
+                 */
+                /* We could also use the analytical form here
+                 * instead of a table, but that can cause issues for
+                 * r close to 0 for non-interacting pairs.
+                 */
                  real rs, frac, f_lr;
                  int  ri;
  
-                rs     = rsq*rinv*tab_ewald_scale;
+                rs     = rsq*rinv*ewtabscale;
                  ri     = (int)rs;
                  frac   = rs - ri;
                  f_lr   = (1 - frac)*tab_ewald_F_lj[ri] + frac*tab_ewald_F_lj[ri+1];
-                FF     = f_lr*rinv;
-                VV     = tab_ewald_V_lj[ri] - tab_ewald_halfsp*frac*(tab_ewald_F_lj[ri] + f_lr);
-                for (i = 0; i < NSTATES; i++)
+                /* TODO: Currently the Ewald LJ table does not contain
+                 * the factor 1/6; we should add this.
+                 */
+                FF     = f_lr*rinv/six;
+                VV     = (tab_ewald_V_lj[ri] - ewtabhalfspace*frac*(tab_ewald_F_lj[ri] + f_lr))/six;
+
+                if (ii == jnr)
                  {
-                    vvtot      += LFV[i]*c6grid[i]*VV*(1.0/6.0);
-                    Fscal      += LFV[i]*c6grid[i]*FF*(1.0/6.0);
-                    dvdl_vdw   += (DLF[i]*c6grid[i])*VV*(1.0/6.0);
+                    /* If we get here, the i particle (ii) has itself (jnr)
+                     * in its neighborlist. This can only happen with the Verlet
+                     * scheme, and corresponds to a self-interaction that will
+                     * occur twice. Scale it down by 50% to only include it once.
+                     */
+                    VV *= half;
                  }
  
+                for (i = 0; i < NSTATES; i++)
+                {
+                    c6grid      = nbfp_grid[tj[i]];
+                    vvtot      += LFV[i]*c6grid*VV;
+                    Fscal      += LFV[i]*c6grid*FF;
+                    dvdl_vdw   += (DLF[i]*c6grid)*VV;
+                }
              }
  
              if (bDoForces)
@@ -816,6 +953,12 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a
      real       velec[2], vvdw[2];
      int        i, ntab;
  
+    const real half        = 0.5;
+    const real one         = 1.0;
+    const real two         = 2.0;
+    const real six         = 6.0;
+    const real fourtyeight = 48.0;
+
      qq[0]    = qqA;
      qq[1]    = qqB;
      c6[0]    = c6A;
@@ -823,12 +966,12 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a
      c12[0]   = c12A;
      c12[1]   = c12B;
  
-    if (sc_r_power == 6.0)
+    if (sc_r_power == six)
      {
          rpm2             = r2*r2;   /* r4 */
          rp               = rpm2*r2; /* r6 */
      }
-    else if (sc_r_power == 48.0)
+    else if (sc_r_power == fourtyeight)
      {
          rp               = r2*r2*r2; /* r6 */
          rp               = rp*rp;    /* r12 */
@@ -838,7 +981,7 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a
      }
      else
      {
-        rp             = pow(r2, 0.5*sc_r_power);  /* not currently supported as input, but can handle it */
+        rp             = pow(r2, half*sc_r_power);  /* not currently supported as input, but can handle it */
          rpm2           = rp/r2;
      }
  
@@ -850,8 +993,8 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a
              /* The c6 & c12 coefficients now contain the constants 6.0 and 12.0, respectively.
               * Correct for this by multiplying with (1/12.0)/(1/6.0)=6.0/12.0=0.5.
               */
-            sigma6[i]       = 0.5*c12[i]/c6[i];
-            sigma2[i]       = pow(0.5*c12[i]/c6[i], 1.0/3.0);
+            sigma6[i]       = half*c12[i]/c6[i];
+            sigma2[i]       = pow(half*c12[i]/c6[i], 1.0/3.0);
              /* should be able to get rid of this ^^^ internal pow call eventually.  Will require agreement on
                 what data to store externally.  Can't be fixed without larger scale changes, so not 5.0 */
              if (sigma6[i] < sigma6_min)   /* for disappearing coul and vdw with soft core at the same time */
@@ -865,12 +1008,12 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a
              sigma6[i]       = sigma6_def;
              sigma2[i]       = sigma2_def;
          }
-        if (sc_r_power == 6.0)
+        if (sc_r_power == six)
          {
              sigma_pow[i]    = sigma6[i];
              sigma_powm2[i]  = sigma6[i]/sigma2[i];
          }
-        else if (sc_r_power == 48.0)
+        else if (sc_r_power == fourtyeight)
          {
              sigma_pow[i]    = sigma6[i]*sigma6[i];       /* sigma^12 */
              sigma_pow[i]    = sigma_pow[i]*sigma_pow[i]; /* sigma^24 */
@@ -908,8 +1051,8 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a
          if ( (qq[i] != 0) || (c6[i] != 0) || (c12[i] != 0) )
          {
              /* Coulomb */
-            rpinv            = 1.0/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp);
-            r_coul           = pow(rpinv, -1.0/sc_r_power);
+            rpinv            = one/(alpha_coul_eff*lfac_coul[i]*sigma_pow[i]+rp);
+            r_coul           = pow(rpinv, -one/sc_r_power);
  
              /* Electrostatics table lookup data */
              rtab             = r_coul*tabscale;
@@ -924,13 +1067,13 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a
              Heps2            = eps2*vftab[ntab+3];
              Fp               = F+Geps+Heps2;
              VV               = Y+eps*Fp;
-            FF               = Fp+Geps+2.0*Heps2;
+            FF               = Fp+Geps+two*Heps2;
              velec[i]         = qq[i]*VV;
              fscal_elec[i]    = -qq[i]*FF*r_coul*rpinv*tabscale;
  
              /* Vdw */
-            rpinv            = 1.0/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp);
-            r_vdw            = pow(rpinv, -1.0/sc_r_power);
+            rpinv            = one/(alpha_vdw_eff*lfac_vdw[i]*sigma_pow[i]+rp);
+            r_vdw            = pow(rpinv, -one/sc_r_power);
              /* Vdw table lookup data */
              rtab             = r_vdw*tabscale;
              ntab             = rtab;
@@ -944,7 +1087,7 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a
              Heps2            = eps2*vftab[ntab+7];
              Fp               = F+Geps+Heps2;
              VV               = Y+eps*Fp;
-            FF               = Fp+Geps+2.0*Heps2;
+            FF               = Fp+Geps+two*Heps2;
              vvdw[i]          = c6[i]*VV;
              fscal_vdw[i]     = -c6[i]*FF;
  
@@ -955,7 +1098,7 @@ nb_free_energy_evaluate_single(real r2, real sc_r_power, real alpha_coul, real a
              Heps2            = eps2*vftab[ntab+11];
              Fp               = F+Geps+Heps2;
              VV               = Y+eps*Fp;
-            FF               = Fp+Geps+2.0*Heps2;
+            FF               = Fp+Geps+two*Heps2;
              vvdw[i]         += c12[i]*VV;
              fscal_vdw[i]    -= c12[i]*FF;
              fscal_vdw[i]    *= r_vdw*rpinv*tabscale;