From 8b44f79a5288f5bc70af330d9e7fa79f45763889 Mon Sep 17 00:00:00 2001
From: Szilard Pall <pszilard@cbr.su.se>
Date: Tue, 12 Mar 2013 03:18:43 +0100
Subject: [PATCH] CUDA PME kernels with analytical Ewald correction

The analytical Ewald kernels have been used in the CPU SIMD kernels, but
due to CUDA compiler issues it has been difficult to determine in which
cases does this provide a performance advantage compared to the
tabulated kernels.Although the nvcc optimizations are rather unreliable,
on Kepler (SM 3.x) the analytical Ewald kernels are up to 5% faster, but
on Fermi (SM 2.x) 7% slower than the tabulated. Hence, this commit
enables the analytical kernels as default for Kepler GPUs, but keeps the
tabulated kernels as default on Fermi.

Note that the analytical Ewald correction is not implemented in the
legacy kernels as these are anyway only used on Fermi.

Additional minor change is the back-port of some variable (re)naming and
simple optimizations from the default to the legacy CUDA kernels which
give 2-3% performance improvement and better code readability.

Change-Id: Idd4659ef3805609356fe8865dc57fd19b0b614fe
---
 src/mdlib/nbnxn_cuda/nbnxn_cuda.cu            | 79 +++++++++++-----
 src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu  | 90 +++++++++++++------
 src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh    | 33 ++++---
 .../nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh   | 60 ++++++-------
 .../nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh    | 40 +++++++++
 src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh   | 50 ++++++++---
 src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h       | 18 +++-
 7 files changed, 263 insertions(+), 107 deletions(-)

diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
index 2611660b2b..de3bdc9bf7 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda.cu
@@ -75,8 +75,13 @@ texture<float, 1, cudaReadModeElementType> tex_coulomb_tab;
 /***** The kernels come here *****/
 #include "nbnxn_cuda_kernel_utils.cuh"
 
-/* Generate all combinations of kernels through multiple inclusion:
-   F, F + E, F + prune, F + E + prune. */
+/* Top-level kernel generation: will generate through multiple inclusion the
+ * following flavors for all kernels:
+ * - force-only output;
+ * - force and energy output;
+ * - force-only with pair list pruning;
+ * - force and energy output with pair list pruning.
+ */
 /** Force only **/
 #include "nbnxn_cuda_kernels.cuh"
 /** Force & energy **/
@@ -126,7 +131,7 @@ static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
     /* do we exceed the grid x dimension limit? */
     if (nwork_units > max_grid_x_size)
     {
-        gmx_fatal(FARGS, "Watch out system too large to simulate!\n"
+        gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
                   "The number of nonbonded work units (=number of super-clusters) exceeds the"
                   "maximum grid size in x dimension (%d > %d)!", nwork_units, max_grid_x_size);
     }
@@ -141,32 +146,46 @@ static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo)
 static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */
 static const int nPruneKernelTypes  = 2; /* 0 - no prune, 1 - prune */
 
-/* Default kernels */
+/*! Pointers to the default kernels organized in a 3 dim array by:
+ *  electrostatics type, energy calculation on/off, and pruning on/off.
+ *
+ *  Note that the order of electrostatics (1st dimension) has to match the
+ *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ */
 static const nbnxn_cu_kfunc_ptr_t
 nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 {
-    { { k_nbnxn_ewald,              k_nbnxn_ewald_prune },
-      { k_nbnxn_ewald_ener,         k_nbnxn_ewald_ener_prune } },
-    { { k_nbnxn_ewald_twin,         k_nbnxn_ewald_twin_prune },
-      { k_nbnxn_ewald_twin_ener,    k_nbnxn_ewald_twin_ener_prune } },
-    { { k_nbnxn_rf,                 k_nbnxn_rf_prune },
-      { k_nbnxn_rf_ener,            k_nbnxn_rf_ener_prune } },
-    { { k_nbnxn_cutoff,             k_nbnxn_cutoff_prune },
-      { k_nbnxn_cutoff_ener,        k_nbnxn_cutoff_ener_prune } },
+    { { k_nbnxn_cutoff,                     k_nbnxn_cutoff_prune },
+      { k_nbnxn_cutoff_ener,                k_nbnxn_cutoff_ener_prune } },
+    { { k_nbnxn_rf,                         k_nbnxn_rf_prune },
+      { k_nbnxn_rf_ener,                    k_nbnxn_rf_ener_prune } },
+    { { k_nbnxn_ewald_tab,                  k_nbnxn_ewald_tab_prune },
+      { k_nbnxn_ewald_tab_ener,             k_nbnxn_ewald_tab_ener_prune } },
+    { { k_nbnxn_ewald_tab_twin,             k_nbnxn_ewald_tab_twin_prune },
+      { k_nbnxn_ewald_tab_twin_ener,        k_nbnxn_ewald_twin_ener_prune } },
+    { { k_nbnxn_ewald,                      k_nbnxn_ewald_prune },
+      { k_nbnxn_ewald_ener,                 k_nbnxn_ewald_ener_prune } },
+    { { k_nbnxn_ewald_twin,                 k_nbnxn_ewald_twin_prune },
+      { k_nbnxn_ewald_twin_ener,            k_nbnxn_ewald_twin_ener_prune } },
 };
 
-/* Legacy kernels */
+/*! Pointers to the legacy kernels organized in a 3 dim array by:
+ *  electrostatics type, energy calculation on/off, and pruning on/off.
+ *
+ *  Note that the order of electrostatics (1st dimension) has to match the
+ *  order of corresponding enumerated types defined in nbnxn_cuda_types.h.
+ */
 static const nbnxn_cu_kfunc_ptr_t
 nb_legacy_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] =
 {
-    { { k_nbnxn_ewald_legacy,           k_nbnxn_ewald_prune_legacy },
-      { k_nbnxn_ewald_ener_legacy,      k_nbnxn_ewald_ener_prune_legacy } },
-    { { k_nbnxn_ewald_twin_legacy,      k_nbnxn_ewald_twin_prune_legacy },
-      { k_nbnxn_ewald_twin_ener_legacy, k_nbnxn_ewald_twin_ener_prune_legacy } },
-    { { k_nbnxn_rf_legacy,              k_nbnxn_rf_prune_legacy },
-      { k_nbnxn_rf_ener_legacy,         k_nbnxn_rf_ener_prune_legacy } },
-    { { k_nbnxn_cutoff_legacy,          k_nbnxn_cutoff_prune_legacy },
-      { k_nbnxn_cutoff_ener_legacy,     k_nbnxn_cutoff_ener_prune_legacy } },
+    { { k_nbnxn_cutoff_legacy,              k_nbnxn_cutoff_prune_legacy },
+      { k_nbnxn_cutoff_ener_legacy,         k_nbnxn_cutoff_ener_prune_legacy } },
+    { { k_nbnxn_rf_legacy,                  k_nbnxn_rf_prune_legacy },
+      { k_nbnxn_rf_ener_legacy,             k_nbnxn_rf_ener_prune_legacy } },
+    { { k_nbnxn_ewald_tab_legacy,           k_nbnxn_ewald_tab_prune_legacy },
+      { k_nbnxn_ewald_tab_ener_legacy,      k_nbnxn_ewald_tab_ener_prune_legacy } },
+    { { k_nbnxn_ewald_tab_twin_legacy,      k_nbnxn_ewald_tab_twin_prune_legacy },
+      { k_nbnxn_ewald_tab_twin_ener_legacy, k_nbnxn_ewald_tab_twin_ener_prune_legacy } },
 };
 
 /*! Return a pointer to the kernel version to be executed at the current step. */
@@ -178,6 +197,9 @@ static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int kver, int eeltype,
 
     if (NBNXN_KVER_LEGACY(kver))
     {
+        /* no analytical Ewald with legacy kernels */
+        assert(eeltype <= eelCuEWALD_TAB_TWIN);
+
         return nb_legacy_kfunc_ptr[eeltype][bDoEne][bDoPrune];
     }
     else
@@ -656,12 +678,19 @@ void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
     cudaError_t stat;
 
     for (int i = 0; i < eelCuNR; i++)
+    {
         for (int j = 0; j < nEnergyKernelTypes; j++)
+        {
             for (int k = 0; k < nPruneKernelTypes; k++)
             {
-                /* Legacy kernel 16/48 kB Shared/L1 */
-                stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
-                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+                /* Legacy kernel 16/48 kB Shared/L1
+                 * No analytical Ewald!
+                 */
+                if (i != eelCuEWALD_ANA && i != eelCuEWALD_ANA_TWIN)
+                {
+                    stat = cudaFuncSetCacheConfig(nb_legacy_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
+                    CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+                }
 
                 if (devinfo->prop.major >= 3)
                 {
@@ -676,4 +705,6 @@ void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
                 }
                 CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
             }
+        }
+    }
 }
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index 3107e25460..f2d427b18c 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -180,10 +180,65 @@ static void init_atomdata_first(cu_atomdata_t *ad, int ntypes)
     ad->nalloc = -1;
 }
 
+/*! Selects the Ewald kernel type, analytical on SM 3.0 and later, tabulated on
+    earlier GPUs, single or twin cut-off. */
+static int pick_ewald_kernel_type(bool                   bTwinCut,
+                                  const cuda_dev_info_t *dev_info)
+{
+    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
+    int  kernel_type;
+
+    /* Benchmarking/development environment variables to force the use of
+       analytical or tabulated Ewald kernel. */
+    bForceAnalyticalEwald = (getenv("GMX_CUDA_NB_ANA_EWALD") != NULL);
+    bForceTabulatedEwald  = (getenv("GMX_CUDA_NB_TAB_EWALD") != NULL);
+
+    if (bForceAnalyticalEwald && bForceTabulatedEwald)
+    {
+        gmx_incons("Both analytical and tabulated Ewald CUDA non-bonded kernels "
+                   "requested through environment variables.");
+    }
+
+    /* By default, on SM 3.0 and later use analytical Ewald, on earlier tabulated. */
+    if ((dev_info->prop.major >= 3 || bForceAnalyticalEwald) && !bForceTabulatedEwald)
+    {
+        bUseAnalyticalEwald = true;
+
+        if (debug)
+        {
+            fprintf(debug, "Using analytical Ewald CUDA kernels\n");
+        }
+    }
+    else
+    {
+        bUseAnalyticalEwald = false;
+
+        if (debug)
+        {
+            fprintf(debug, "Using tabulated Ewald CUDA kernels\n");
+        }
+    }
+
+    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
+       forces it (use it for debugging/benchmarking only). */
+    if (!bTwinCut && (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL))
+    {
+        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA : eelCuEWALD_TAB;
+    }
+    else
+    {
+        kernel_type = bUseAnalyticalEwald ? eelCuEWALD_ANA_TWIN : eelCuEWALD_TAB_TWIN;
+    }
+
+    return kernel_type;
+}
+
+
 /*! Initializes the nonbonded parameter data structure. */
 static void init_nbparam(cu_nbparam_t *nbp,
                          const interaction_const_t *ic,
-                         const nonbonded_verlet_t *nbv)
+                         const nonbonded_verlet_t *nbv,
+                         const cuda_dev_info_t *dev_info)
 {
     cudaError_t stat;
     int         ntypes, nnbfp;
@@ -210,16 +265,8 @@ static void init_nbparam(cu_nbparam_t *nbp,
     }
     else if ((EEL_PME(ic->eeltype) || ic->eeltype==eelEWALD))
     {
-        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off, unless
-           forced by the env. var. (used only for benchmarking). */
-        if (getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
-        {
-            nbp->eeltype = eelCuEWALD;
-        }
-        else
-        {
-            nbp->eeltype = eelCuEWALD_TWIN;
-        }
+        /* Initially rcoulomb == rvdw, so it's surely not twin cut-off. */
+        nbp->eeltype = pick_ewald_kernel_type(false, dev_info);
     }
     else
     {
@@ -228,9 +275,9 @@ static void init_nbparam(cu_nbparam_t *nbp,
     }
 
     /* generate table for PME */
-    if (nbp->eeltype == eelCuEWALD)
+    nbp->coulomb_tab = NULL;
+    if (nbp->eeltype == eelCuEWALD_TAB || nbp->eeltype == eelCuEWALD_TAB_TWIN)
     {
-        nbp->coulomb_tab = NULL;
         init_ewald_coulomb_force_table(nbp);
     }
 
@@ -256,17 +303,8 @@ void nbnxn_cuda_pme_loadbal_update_param(nbnxn_cuda_ptr_t cu_nb,
     nbp->rcoulomb_sq    = ic->rcoulomb * ic->rcoulomb;
     nbp->ewald_beta     = ic->ewaldcoeff;
 
-    /* When switching to/from twin cut-off, the electrostatics type needs updating.
-       (The env. var. that forces twin cut-off is for benchmarking only!) */
-    if (ic->rcoulomb == ic->rvdw &&
-        getenv("GMX_CUDA_NB_EWALD_TWINCUT") == NULL)
-    {
-        nbp->eeltype = eelCuEWALD;
-    }
-    else
-    {
-        nbp->eeltype = eelCuEWALD_TWIN;
-    }
+    nbp->eeltype        = pick_ewald_kernel_type(ic->rcoulomb != ic->rvdw,
+                                                 cu_nb->dev_info);
 
     init_ewald_coulomb_force_table(cu_nb->nbparam);
 }
@@ -609,7 +647,7 @@ void nbnxn_cuda_init_const(nbnxn_cuda_ptr_t cu_nb,
                            const nonbonded_verlet_t *nbv)
 {
     init_atomdata_first(cu_nb->atdat, nbv->grp[0].nbat->ntype);
-    init_nbparam(cu_nb->nbparam, ic, nbv);
+    init_nbparam(cu_nb->nbparam, ic, nbv, cu_nb->dev_info);
 
     /* clear energy and shift force outputs */
     nbnxn_cuda_clear_e_fshift(cu_nb);
@@ -804,7 +842,7 @@ void nbnxn_cuda_free(FILE *fplog, nbnxn_cuda_ptr_t cu_nb)
     plist_nl    = cu_nb->plist[eintNonlocal];
     timers      = cu_nb->timers;
 
-    if (nbparam->eeltype == eelCuEWALD || nbparam->eeltype == eelCuEWALD_TWIN)
+    if (nbparam->eeltype == eelCuEWALD_TAB || nbparam->eeltype == eelCuEWALD_TAB_TWIN)
     {
       stat = cudaUnbindTexture(nbnxn_cuda_get_coulomb_tab_texref());
       CU_RET_ERR(stat, "cudaUnbindTexture on coulomb_tab failed");
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
index 23df93a292..73acdfecd5 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -49,6 +49,11 @@
 #define IATYPE_SHMEM
 #endif
 
+#if defined EL_EWALD_ANA || defined EL_EWALD_TAB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define EL_EWALD_ANY
+#endif
+
 /*
    Kernel launch parameters:
     - #blocks   = #pair lists, blockId = pair list Id
@@ -95,16 +100,20 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #ifdef EL_RF
     float two_k_rf              = nbparam.two_k_rf;
 #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
     float coulomb_tab_scale     = nbparam.coulomb_tab_scale;
 #endif
+#ifdef EL_EWALD_ANA
+    float beta2                 = nbparam.ewald_beta*nbparam.ewald_beta;
+    float beta3                 = nbparam.ewald_beta*nbparam.ewald_beta*nbparam.ewald_beta;
+#endif
 #ifdef PRUNE_NBL
     float rlist_sq              = nbparam.rlist_sq;
 #endif
 
 #ifdef CALC_ENERGIES
     float lj_shift    = nbparam.sh_invrc6;
-#ifdef EL_EWALD
+#ifdef EL_EWALD_ANY
     float beta        = nbparam.ewald_beta;
     float ewald_shift = nbparam.sh_ewald;
 #else
@@ -185,7 +194,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
     E_lj = 0.0f;
     E_el = 0.0f;
 
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_ANY || defined EL_RF
     if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
     {
         /* we have the diagonal: add the charge self interaction energy term */
@@ -256,7 +265,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
                     fcj_buf = make_float3(0.0f);
 
                     /* The PME and RF kernels don't unroll with CUDA <v4.1. */
-#if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD || defined EL_RF))
+#if !defined PRUNE_NBL && !(CUDA_VERSION < 4010 && (defined EL_EWALD_ANY || defined EL_RF))
 #pragma unroll 8
 #endif
                     for(i = 0; i < NCL_PER_SUPERCL; i++)
@@ -289,7 +298,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
                             int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
 
                             /* cutoff & exclusion check */
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_ANY || defined EL_RF
                             if (r2 < rcoulomb_sq *
                                 (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
 #else
@@ -314,7 +323,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
                                 inv_r   = rsqrt(r2);
                                 inv_r2  = inv_r * inv_r;
                                 inv_r6  = inv_r2 * inv_r2 * inv_r2;
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_ANY || defined EL_RF
                                 /* We could mask inv_r2, but with Ewald
                                  * masking both inv_r6 and F_invr is faster */
                                 inv_r6  *= int_bit;
@@ -345,9 +354,11 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #ifdef EL_RF
                                 F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
 #endif
-#ifdef EL_EWALD
+#if defined EL_EWALD_ANA
+                                F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
+#elif defined EL_EWALD_TAB
                                 F_invr  += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
-#endif
+#endif /* EL_EWALD_ANA/TAB */
 
 #ifdef CALC_ENERGIES
 #ifdef EL_CUTOFF
@@ -356,10 +367,10 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #ifdef EL_RF
                                 E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
 #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_ANY
                                 /* 1.0f - erff is faster than erfcf */
                                 E_el    += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
-#endif
+#endif /* EL_EWALD_ANY */
 #endif
                                 f_ij    = rv * F_invr;
 
@@ -440,3 +451,5 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn)
 #endif
 #endif
 }
+
+#undef EL_EWALD_ANY
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
index 0733d9412e..cff062d7a1 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_legacy.cuh
@@ -42,6 +42,12 @@
  * code that is in double precision.
  */
 
+/* Analytical Ewald is not implemented for the legacy kernels (as it is anyway
+   slower than the tabulated kernel on Fermi). */
+#ifdef EL_EWALD_ANA
+#error Trying to generate Analytical Ewald legacy kernels which is not implemented in the legacy kernels!
+#endif
+
 /*
    Kernel launch parameters:
     - #blocks   = #pair lists, blockId = pair list Id
@@ -88,7 +94,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
 #ifdef EL_RF
     float two_k_rf              = nbparam.two_k_rf;
 #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
     float coulomb_tab_scale     = nbparam.coulomb_tab_scale;
 #endif
 #ifdef PRUNE_NBL
@@ -97,7 +103,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
 
 #ifdef CALC_ENERGIES
     float lj_shift    = nbparam.sh_invrc6;
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
     float beta        = nbparam.ewald_beta;
     float ewald_shift = nbparam.sh_ewald;
 #else
@@ -122,14 +128,12 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
     float qi, qj_f,
           r2, inv_r, inv_r2, inv_r6,
           c6, c12,
+          int_bit,
 #ifdef CALC_ENERGIES
           E_lj, E_el, E_lj_p,
 #endif
           F_invr;
-    unsigned int wexcl, int_bit, imask, imask_j;
-#ifdef PRUNE_NBL
-    unsigned int imask_prune;
-#endif
+    unsigned int wexcl, imask, mask_ji;
     float4 xqbuf;
     float3 xi, xj, rv, f_ij, fcj_buf, fshift_buf;
     float3 fci_buf[NCL_PER_SUPERCL];    /* i force buffer */
@@ -162,7 +166,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
     E_lj = 0.0f;
     E_el = 0.0f;
 
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_TAB || defined EL_RF
     if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL)
     {
         /* we have the diagonal: add the charge self interaction energy term */
@@ -201,20 +205,16 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
         if (imask)
 #endif
         {
-#ifdef PRUNE_NBL
-            imask_prune = imask;
-#endif
-
             /* nvcc >v4.1 doesn't like this loop, it refuses to unroll it */
 #if CUDA_VERSION >= 4010
             #pragma unroll 4
 #endif
             for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
             {
-                imask_j = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
-                if (imask_j)
+                mask_ji = (imask >> (jm * CL_SIZE)) & supercl_interaction_mask;
+                if (mask_ji)
                 {
-                    nsubi = __popc(imask_j);
+                    nsubi = __popc(mask_ji);
 
                     cj      = pl_cj4[j4].cj[jm];
                     aj      = cj * CL_SIZE + tidxj;
@@ -233,8 +233,8 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                        which is a pity because here unrolling could help.  */
                     for (cii = 0; cii < nsubi; cii++)
                     {
-                        i = __ffs(imask_j) - 1;
-                        imask_j &= ~(1U << i);
+                        i = __ffs(mask_ji) - 1;
+                        mask_ji &= ~(1U << i);
 
                         ci_offset   = i;    /* i force buffer offset */
 
@@ -255,14 +255,14 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                                cluster-pair in imask gets set to 0. */
                         if (!__any(r2 < rlist_sq))
                         {
-                            imask_prune &= ~(1U << (jm * NCL_PER_SUPERCL + i));
+                            imask &= ~(1U << (jm * NCL_PER_SUPERCL + i));
                         }
 #endif
 
-                        int_bit = ((wexcl >> (jm * NCL_PER_SUPERCL + i)) & 1);
+                        int_bit = ((wexcl >> (jm * NCL_PER_SUPERCL + i)) & 1) ? 1.0f : 0.0f;
 
                         /* cutoff & exclusion check */
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_TAB || defined EL_RF
                         if (r2 < rcoulomb_sq *
                             (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
 #else
@@ -283,7 +283,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                             inv_r   = rsqrt(r2);
                             inv_r2  = inv_r * inv_r;
                             inv_r6  = inv_r2 * inv_r2 * inv_r2;
-#if defined EL_EWALD || defined EL_RF
+#if defined EL_EWALD_TAB || defined EL_RF
                             /* We could mask inv_r2, but with Ewald
                              * masking both inv_r6 and F_invr is faster */
                             inv_r6  *= int_bit;
@@ -292,19 +292,19 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
                             F_invr  = inv_r6 * (c12 * inv_r6 - c6) * inv_r2;
 
 #ifdef CALC_ENERGIES
-                                E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
+                            E_lj_p  = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f);
 #endif
 
 #ifdef VDW_CUTOFF_CHECK
-                                /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
-                                vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
-                                F_invr  *= vdw_in_range;
+                            /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */
+                            vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f;
+                            F_invr  *= vdw_in_range;
 #ifdef CALC_ENERGIES
-                                E_lj_p  *= vdw_in_range;
+                            E_lj_p  *= vdw_in_range;
 #endif
 #endif
 #ifdef CALC_ENERGIES
-                                E_lj    += E_lj_p;
+                            E_lj    += E_lj_p;
 #endif
 
 
@@ -314,9 +314,9 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
 #ifdef EL_RF
                             F_invr  += qi * qj_f * (int_bit*inv_r2 * inv_r - two_k_rf);
 #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
                             F_invr  += qi * qj_f * (int_bit*inv_r2 - interpolate_coulomb_force_r(r2 * inv_r, coulomb_tab_scale)) * inv_r;
-#endif
+#endif /* EL_EWALD_TAB */
 
 #ifdef CALC_ENERGIES
 #ifdef EL_CUTOFF
@@ -325,7 +325,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
 #ifdef EL_RF
                             E_el    += qi * qj_f * (int_bit*inv_r + 0.5f * two_k_rf * r2 - c_rf);
 #endif
-#ifdef EL_EWALD
+#ifdef EL_EWALD_TAB
                             /* 1.0f - erff is faster than erfcf */
                             E_el    += qi * qj_f * (inv_r * (int_bit - erff(r2 * inv_r * beta)) - int_bit * ewald_shift);
 #endif
@@ -352,7 +352,7 @@ __global__ void NB_KERNEL_FUNC_NAME(k_nbnxn, _legacy)
 #ifdef PRUNE_NBL
             /* Update the imask with the new one which does not contain the
                out of range clusters anymore. */
-            pl_cj4[j4].imei[widx].imask = imask_prune;
+            pl_cj4[j4].imei[widx].imask = imask;
 #endif
         }
     }
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
index 4992f2beeb..ad08ffa5e6 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -69,6 +69,46 @@ float interpolate_coulomb_force_r(float r, float scale)
             + fract2 * tex1Dfetch(tex_coulomb_tab, index + 1);
 }
 
+/*! Calculate analytical Ewald correction term. */
+static inline __device__
+float pmecorrF(float z2)
+{
+    const float FN6 = -1.7357322914161492954e-8f;
+    const float FN5 = 1.4703624142580877519e-6f;
+    const float FN4 = -0.000053401640219807709149f;
+    const float FN3 = 0.0010054721316683106153f;
+    const float FN2 = -0.019278317264888380590f;
+    const float FN1 = 0.069670166153766424023f;
+    const float FN0 = -0.75225204789749321333f;
+
+    const float FD4 = 0.0011193462567257629232f;
+    const float FD3 = 0.014866955030185295499f;
+    const float FD2 = 0.11583842382862377919f;
+    const float FD1 = 0.50736591960530292870f;
+    const float FD0 = 1.0f;
+
+    float       z4;
+    float       polyFN0,polyFN1,polyFD0,polyFD1;
+
+    z4          = z2*z2;
+
+    polyFD0     = FD4*z4 + FD2;
+    polyFD1     = FD3*z4 + FD1;
+    polyFD0     = polyFD0*z4 + FD0;
+    polyFD0     = polyFD1*z2 + polyFD0;
+
+    polyFD0     = 1.0f/polyFD0;
+
+    polyFN0     = FN6*z4 + FN4;
+    polyFN1     = FN5*z4 + FN3;
+    polyFN0     = polyFN0*z4 + FN2;
+    polyFN1     = polyFN1*z4 + FN1;
+    polyFN0     = polyFN0*z4 + FN0;
+    polyFN0     = polyFN1*z2 + polyFN0;
+
+    return polyFN0*polyFD0;
+}
+
 /*! Final j-force reduction; this generic implementation works with
  *  arbitrary array sizes.
  */
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
index 64c1008eb7..78aaa0dd46 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -37,15 +37,17 @@
  */
 
 /*! \file
- *  This header has the sole purpose of generating kernels for the different
- *  type of electrostatics supported: Cut-off, Reaction-Field, and Ewald/PME;
- *  the latter has a twin-range cut-off version (rcoul!=rvdw) which enables
- *  PME tuning (otherwise in the Verlet scheme rcoul==rvdw).
+ *  This header has the sole purpose of generating kernels for the supported
+ *  electrostatics types: cut-off, reaction-field, Ewald, and tabulated Ewald.
  *
- *  (No include fence as it is meant to be included multiple times.)
+ *  The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
+ *  require an extra distance check to enable  PP-PME load balancing
+ *  (otherwise, by default rcoul == rvdw).
+ *
+ *  NOTE: No include fence as it is meant to be included multiple times.
  */
 
-/* Cut-Off */
+/* Analytical plain cut-off kernels */
 #define EL_CUTOFF
 #define NB_KERNEL_FUNC_NAME(x,...) x##_cutoff##__VA_ARGS__
 #include "nbnxn_cuda_kernel_legacy.cuh"
@@ -53,7 +55,7 @@
 #undef EL_CUTOFF
 #undef NB_KERNEL_FUNC_NAME
 
-/* Reaction-Field */
+/* Analytical reaction-field kernels */
 #define EL_RF
 #define NB_KERNEL_FUNC_NAME(x,...) x##_rf##__VA_ARGS__
 #include "nbnxn_cuda_kernel_legacy.cuh"
@@ -61,20 +63,40 @@
 #undef EL_RF
 #undef NB_KERNEL_FUNC_NAME
 
-/* Ewald */
-#define EL_EWALD
+/* Analytical Ewald interaction kernels
+ * NOTE: no legacy kernels with analytical Ewald.
+ */
+#define EL_EWALD_ANA
 #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald##__VA_ARGS__
-#include "nbnxn_cuda_kernel_legacy.cuh"
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD
+#undef EL_EWALD_ANA
 #undef NB_KERNEL_FUNC_NAME
 
-/* Ewald with twin-range cut-off */
-#define EL_EWALD
+/* Analytical Ewald interaction kernels with twin-range cut-off
+ * NOTE: no legacy kernels with analytical Ewald.
+ */
+#define EL_EWALD_ANA
 #define VDW_CUTOFF_CHECK
 #define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_twin##__VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_EWALD_ANA
+#undef VDW_CUTOFF_CHECK
+#undef NB_KERNEL_FUNC_NAME
+
+/* Tabulated Ewald interaction kernels */
+#define EL_EWALD_TAB
+#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab##__VA_ARGS__
+#include "nbnxn_cuda_kernel_legacy.cuh"
+#include "nbnxn_cuda_kernel.cuh"
+#undef EL_EWALD_TAB
+#undef NB_KERNEL_FUNC_NAME
+
+/* Tabulated Ewald interaction kernels with twin-range cut-off */
+#define EL_EWALD_TAB
+#define VDW_CUTOFF_CHECK
+#define NB_KERNEL_FUNC_NAME(x,...) x##_ewald_tab_twin##__VA_ARGS__
 #include "nbnxn_cuda_kernel_legacy.cuh"
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD
+#undef EL_EWALD_TAB
 #undef VDW_CUTOFF_CHECK
 #undef NB_KERNEL_FUNC_NAME
diff --git a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index 55f67e9288..901e784c32 100644
--- a/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -47,13 +47,25 @@
 extern "C" {
 #endif
 
-/*! Types of electrostatics available in the CUDA nonbonded force kernels. */
+/*! Types of electrostatics implementations available in the CUDA non-bonded
+ *  force kernels. These represent both the electrostatics types implemented
+ *  by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
+ *  enums.h) as well as encode implementation details analytical/tabulated
+ *  and single or twin cut-off (for Ewald kernels).
+ *  Note that the cut-off and RF kernels have only analytical flavor and unlike
+ *  in the CPU kernels, the tabulated kernels are ATM Ewald-only.
+ *
+ *  The order of pointers to different electrostatic kernels defined in
+ *  nbnxn_cuda.cu by the nb_default_kfunc_ptr and nb_legacy_kfunc_ptr arrays
+ *  should match the order of enumerated types below. */
 enum {
-    eelCuEWALD, eelCuEWALD_TWIN, eelCuRF, eelCuCUT, eelCuNR
+    eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA, eelCuEWALD_ANA_TWIN, eelCuNR
 };
 
+/*! Kernel flavors with different set of optimizations: default for CUDA <=v4.1
+ *  compilers and legacy for earlier, 3.2 and 4.0 CUDA compilers. */
 enum {
-    eNbnxnCuKDefault, eNbnxnCuKLegacy, eNbnxnCuKOld, eNbnxnCuKNR
+    eNbnxnCuKDefault, eNbnxnCuKLegacy, eNbnxnCuKNR
 };
 
 #define NBNXN_KVER_OLD(k)      (k == eNbnxnCuKOld)
-- 
2.22.0