From a8cf8c39f389d529696015272a21cf24acc5bc15 Mon Sep 17 00:00:00 2001 From: Szilard Pall Date: Fri, 10 Jan 2014 12:46:15 +0100 Subject: [PATCH] CUDA kernels for switched LJ This change adds CUDA GPU kernels for force and potential switching, implemented in the Verlet scheme for CPUs in f3acaf. Change-Id: I73a83d352924379d9005104fa59f3478d3e684b0 --- src/gromacs/mdlib/forcerec.c | 6 +- src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu | 132 ++++++++++++----- .../mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu | 27 +++- .../mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh | 47 ++++-- .../nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh | 135 +++++++++++++++++- .../mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh | 127 ++++++++++++++-- .../mdlib/nbnxn_cuda/nbnxn_cuda_types.h | 103 +++++++++---- 7 files changed, 474 insertions(+), 103 deletions(-) diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c index 954c6a714a..3784522200 100644 --- a/src/gromacs/mdlib/forcerec.c +++ b/src/gromacs/mdlib/forcerec.c @@ -1545,11 +1545,9 @@ gmx_bool nbnxn_acceleration_supported(FILE *fplog, /* TODO: remove these GPU specific restrictions by implementing CUDA kernels */ if (bGPU) { - if (ir->vdw_modifier == eintmodFORCESWITCH || - ir->vdw_modifier == eintmodPOTSWITCH || - ir->vdwtype == evdwPME) + if (ir->vdwtype == evdwPME) { - md_print_warn(cr, fplog, "LJ switch functions and LJ-PME are not yet supported on the GPU, falling back to CPU only\n"); + md_print_warn(cr, fplog, "LJ-PME is not yet supported with GPUs, falling back to CPU only\n"); return FALSE; } } diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 134b43e3a9..7477ac7863 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -143,40 +143,93 @@ static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo) /* Constant arrays listing all kernel function pointers and enabling selection of a kernel in an elegant manner. */ -static const int nEnergyKernelTypes = 2; /* 0 - no energy, 1 - energy */ -static const int nPruneKernelTypes = 2; /* 0 - no prune, 1 - prune */ - -/*! Pointers to the default kernels organized in a 3 dim array by: - * electrostatics type, energy calculation on/off, and pruning on/off. +/*! Pointers to the non-bonded kernels organized in 2-dim arrays by: + * electrostatics and VDW type. * - * Note that the order of electrostatics (1st dimension) has to match the - * order of corresponding enumerated types defined in nbnxn_cuda_types.h. + * Note that the row- and column-order of function pointers has to match the + * order of corresponding enumerated electrostatics and vdw types, resp., + * defined in nbnxn_cuda_types.h. */ -static const nbnxn_cu_kfunc_ptr_t - nb_default_kfunc_ptr[eelCuNR][nEnergyKernelTypes][nPruneKernelTypes] = + +/*! Force-only kernel function pointers. 
*/ +static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] = +{ + { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda } +}; + +/*! Force + energy kernel function pointers. */ +static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] = +{ + { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda } +}; + +/*! Force + pruning kernel function pointers. 
*/ +static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] = { - { { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda } }, - { { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda } } + { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda } +}; + +/*! Force + energy + pruning kernel function pointers. */ +static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] = +{ + { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda } }; /*! Return a pointer to the kernel version to be executed at the current step. 
*/
 static inline nbnxn_cu_kfunc_ptr_t select_nbnxn_kernel(int  eeltype,
+                                                       int  evdwtype,
                                                        bool bDoEne,
                                                        bool bDoPrune)
 {
+    nbnxn_cu_kfunc_ptr_t res;
+
+    assert(eeltype < eelCuNR);
+    assert(evdwtype < evdwCuNR);
 
-    return nb_default_kfunc_ptr[eeltype][bDoEne][bDoPrune];
+    if (bDoEne)
+    {
+        if (bDoPrune)
+        {
+            res = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
+        }
+        else
+        {
+            res = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
+        }
+    }
+    else
+    {
+        if (bDoPrune)
+        {
+            res = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
+        }
+        else
+        {
+            res = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
+        }
+    }
+
+    return res;
 }
 
 /*! Calculates the amount of shared memory required by the CUDA kernel in use. */
@@ -302,7 +355,9 @@ void nbnxn_cuda_launch_kernel(nbnxn_cuda_ptr_t cu_nb,
     }
 
     /* get the pointer to the kernel flavor we need to use */
-    nb_kernel = select_nbnxn_kernel(nbp->eeltype, bCalcEner,
+    nb_kernel = select_nbnxn_kernel(nbp->eeltype,
+                                    nbp->vdwtype,
+                                    bCalcEner,
                                     plist->bDoPrune || always_prune);
 
     /* kernel launch config */
@@ -642,23 +697,26 @@ void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo)
 
     for (int i = 0; i < eelCuNR; i++)
     {
-        for (int j = 0; j < nEnergyKernelTypes; j++)
+        for (int j = 0; j < evdwCuNR; j++)
         {
-            for (int k = 0; k < nPruneKernelTypes; k++)
+            if (devinfo->prop.major >= 3)
+            {
+                /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
+                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferShared);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferShared);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferShared);
+            }
+            else
             {
-                if (devinfo->prop.major >= 3)
-                {
-                    /* Default kernel on sm 3.x 48/16 kB Shared/L1 */
-                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferShared);
-                }
-                else
-                {
-                    /* On Fermi prefer L1 gives 2% higher performance */
-                    /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
-                    stat = cudaFuncSetCacheConfig(nb_default_kfunc_ptr[i][j][k], cudaFuncCachePreferL1);
-                }
-                CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
+                /* On Fermi, preferring L1 gives 2% higher performance */
+                /* Default kernel on sm_2.x 16/48 kB Shared/L1 */
+                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_prune_ptr[i][j], cudaFuncCachePreferL1);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_ener_noprune_ptr[i][j], cudaFuncCachePreferL1);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_prune_ptr[i][j], cudaFuncCachePreferL1);
+                stat = cudaFuncSetCacheConfig(nb_kfunc_noener_noprune_ptr[i][j], cudaFuncCachePreferL1);
             }
+            CU_RET_ERR(stat, "cudaFuncSetCacheConfig failed");
         }
     }
 }
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
index cae773528a..0fe6e8e275 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu
@@ -46,6 +46,7 @@
 #include "smalloc.h"
 #include "tables.h"
 #include "typedefs.h"
+#include "types/enums.h"
 #include "types/nb_verlet.h"
 #include "types/interaction_const.h"
 #include "types/force_flags.h"
@@ -277,13 +278,27 @@ static void init_nbparam(cu_nbparam_t *nbp,
     nbp->rvdw_sq     = ic->rvdw * ic->rvdw;
     nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
     nbp->rlist_sq    = ic->rlist * ic->rlist;
-    nbp->sh_invrc6   = ic->sh_invrc6;
 
-    /* TODO: implemented LJ force- and potential-switch CUDA kernels */
-    if (!(ic->vdw_modifier == eintmodNONE ||
-          ic->vdw_modifier == 
eintmodPOTSHIFT)) - { - gmx_fatal(FARGS, "The CUDA kernels do not yet support switched LJ interactions"); + nbp->rvdw_switch = ic->rvdw_switch; + nbp->dispersion_shift = ic->dispersion_shift; + nbp->repulsion_shift = ic->repulsion_shift; + nbp->vdw_switch = ic->vdw_switch; + + switch (ic->vdw_modifier) + { + case eintmodNONE: + case eintmodPOTSHIFT: + nbp->vdwtype = evdwCuCUT; + break; + case eintmodFORCESWITCH: + nbp->vdwtype = evdwCuFSWITCH; + break; + case eintmodPOTSWITCH: + nbp->vdwtype = evdwCuPSWITCH; + break; + default: + gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!"); + break; } if (ic->eeltype == eelCUT) diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index bc3dd5f43c..1ea1828873 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -112,16 +112,15 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif #ifdef CALC_ENERGIES - float lj_shift = nbparam.sh_invrc6; #ifdef EL_EWALD_ANY float beta = nbparam.ewald_beta; float ewald_shift = nbparam.sh_ewald; #else float c_rf = nbparam.c_rf; -#endif - float *e_lj = atdat.e_lj; - float *e_el = atdat.e_el; -#endif +#endif /* EL_EWALD_ANY */ + float *e_lj = atdat.e_lj; + float *e_el = atdat.e_el; +#endif /* CALC_ENERGIES */ /* thread/block/warp id-s */ unsigned int tidxi = threadIdx.x; @@ -141,7 +140,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) int_bit, F_invr; #ifdef CALC_ENERGIES - float E_lj, E_el, E_lj_p; + float E_lj, E_el; +#endif +#if defined CALC_ENERGIES || defined LJ_POT_SWITCH + float E_lj_p; #endif unsigned int wexcl, imask, mask_ji; float4 xqbuf; @@ -336,19 +338,38 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2; +#if defined CALC_ENERGIES || defined LJ_POT_SWITCH + E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 + nbparam.repulsion_shift.cpot)*ONE_TWELVETH_F - + c6 * (inv_r6 + nbparam.dispersion_shift.cpot)*ONE_SIXTH_F); +#endif +#ifdef LJ_FORCE_SWITCH #ifdef CALC_ENERGIES - E_lj_p = int_bit * (c12 * (inv_r6 * inv_r6 - lj_shift * lj_shift) * 0.08333333f - c6 * (inv_r6 - lj_shift) * 0.16666667f); -#endif + calculate_force_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p); +#else + calculate_force_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr); +#endif /* CALC_ENERGIES */ +#endif /* LJ_FORCE_SWITCH */ #ifdef VDW_CUTOFF_CHECK - /* this enables twin-range cut-offs (rvdw < rcoulomb <= rlist) */ - vdw_in_range = (r2 < rvdw_sq) ? 1.0f : 0.0f; - F_invr *= vdw_in_range; + /* Separate VDW cut-off check to enable twin-range cut-offs + * (rvdw < rcoulomb <= rlist) + */ + vdw_in_range = (r2 < rvdw_sq) ? 
1.0f : 0.0f;
+                        F_invr  *= vdw_in_range;
 #ifdef CALC_ENERGIES
-                        E_lj_p *= vdw_in_range;
-#endif
+                        E_lj_p  *= vdw_in_range;
 #endif
+#endif                  /* VDW_CUTOFF_CHECK */
+
+#ifdef LJ_POT_SWITCH
+#ifdef CALC_ENERGIES
+                        calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#else
+                        calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+#endif /* CALC_ENERGIES */
+#endif /* LJ_POT_SWITCH */
+
 #ifdef CALC_ENERGIES
                         E_lj    += E_lj_p;
 #endif
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
index bf7c8cfacf..00b6d47115 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -48,9 +48,142 @@
 #define CL_SIZE_SQ                 (CL_SIZE * CL_SIZE)
 #define FBUF_STRIDE                (CL_SIZE_SQ)
 
+#define ONE_SIXTH_F     0.16666667f
+#define ONE_TWELVETH_F  0.08333333f
+
+
 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
 const unsigned supercl_interaction_mask = ((1U << NCL_PER_SUPERCL) - 1U);
 
+/*! Apply force switch, force-only version. */
+static inline __device__
+void calculate_force_switch_F(const cu_nbparam_t nbparam,
+                              float              c6,
+                              float              c12,
+                              float              inv_r,
+                              float              r2,
+                              float             *F_invr)
+{
+    float r, r_switch;
+
+    /* force switch constants */
+    float disp_shift_V2 = nbparam.dispersion_shift.c2;
+    float disp_shift_V3 = nbparam.dispersion_shift.c3;
+    float repu_shift_V2 = nbparam.repulsion_shift.c2;
+    float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+    r         = r2 * inv_r;
+    r_switch  = r - nbparam.rvdw_switch;
+    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    *F_invr  +=
+        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+}
+
+/*! Apply force switch, force + energy version. */
+static inline __device__
+void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
+                                float              c6,
+                                float              c12,
+                                float              inv_r,
+                                float              r2,
+                                float             *F_invr,
+                                float             *E_lj)
+{
+    float r, r_switch;
+
+    /* force switch constants */
+    float disp_shift_V2 = nbparam.dispersion_shift.c2;
+    float disp_shift_V3 = nbparam.dispersion_shift.c3;
+    float repu_shift_V2 = nbparam.repulsion_shift.c2;
+    float repu_shift_V3 = nbparam.repulsion_shift.c3;
+
+    float disp_shift_F2 = nbparam.dispersion_shift.c2/3;
+    float disp_shift_F3 = nbparam.dispersion_shift.c3/4;
+    float repu_shift_F2 = nbparam.repulsion_shift.c2/3;
+    float repu_shift_F3 = nbparam.repulsion_shift.c3/4;
+
+    r         = r2 * inv_r;
+    r_switch  = r - nbparam.rvdw_switch;
+    r_switch  = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    *F_invr  +=
+        -c6*(disp_shift_V2 + disp_shift_V3*r_switch)*r_switch*r_switch*inv_r +
+        c12*(repu_shift_V2 + repu_shift_V3*r_switch)*r_switch*r_switch*inv_r;
+    *E_lj    +=
+        c6*(disp_shift_F2 + disp_shift_F3*r_switch)*r_switch*r_switch*r_switch -
+        c12*(repu_shift_F2 + repu_shift_F3*r_switch)*r_switch*r_switch*r_switch;
+}
+
+/*! Apply potential switch, force-only version. 
*/
+static inline __device__
+void calculate_potential_switch_F(const cu_nbparam_t nbparam,
+                                  float              c6,
+                                  float              c12,
+                                  float              inv_r,
+                                  float              r2,
+                                  float             *F_invr,
+                                  float             *E_lj)
+{
+    float r, r_switch;
+    float sw, dsw;
+
+    /* potential switch constants */
+    float switch_V3 = nbparam.vdw_switch.c3;
+    float switch_V4 = nbparam.vdw_switch.c4;
+    float switch_V5 = nbparam.vdw_switch.c5;
+    float switch_F2 = 3*nbparam.vdw_switch.c3;
+    float switch_F3 = 4*nbparam.vdw_switch.c4;
+    float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+    r        = r2 * inv_r;
+    r_switch = r - nbparam.rvdw_switch;
+
+    /* Unlike in the F+E kernel, conditional is faster here */
+    if (r_switch > 0.0f)
+    {
+        sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+        dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+        *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+    }
+}
+
+/*! Apply potential switch, force + energy version. */
+static inline __device__
+void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
+                                    float              c6,
+                                    float              c12,
+                                    float              inv_r,
+                                    float              r2,
+                                    float             *F_invr,
+                                    float             *E_lj)
+{
+    float r, r_switch;
+    float sw, dsw;
+
+    /* potential switch constants */
+    float switch_V3 = nbparam.vdw_switch.c3;
+    float switch_V4 = nbparam.vdw_switch.c4;
+    float switch_V5 = nbparam.vdw_switch.c5;
+    float switch_F2 = 3*nbparam.vdw_switch.c3;
+    float switch_F3 = 4*nbparam.vdw_switch.c4;
+    float switch_F4 = 5*nbparam.vdw_switch.c5;
+
+    r        = r2 * inv_r;
+    r_switch = r - nbparam.rvdw_switch;
+    r_switch = r_switch >= 0.0f ? r_switch : 0.0f;
+
+    /* Unlike in the F-only kernel, masking is faster here */
+    sw      = 1.0f + (switch_V3 + (switch_V4 + switch_V5*r_switch)*r_switch)*r_switch*r_switch*r_switch;
+    dsw     = (switch_F2 + (switch_F3 + switch_F4*r_switch)*r_switch)*r_switch*r_switch;
+
+    *F_invr = (*F_invr)*sw - inv_r*(*E_lj)*dsw;
+    *E_lj  *= sw;
+}
+
+
 /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
  *  Original idea: from the OpenMM project
  */
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
index 8f8c53efe7..555ec57325 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh
@@ -34,8 +34,9 @@
  */
 
 /*! \internal \file
- * This header has the sole purpose of generating kernels for the supported
- * electrostatics types: cut-off, reaction-field, Ewald, and tabulated Ewald.
+ * This header has the sole purpose of generating kernels for the combinations of
+ * supported electrostatics types (cut-off, reaction-field, analytical and
+ * tabulated Ewald) and VDW types (V shift, F switch, V switch).
  *
  * The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which
  * require an extra distance check to enable PP-PME load balancing
 *
 * NOTE: No include fence as it is meant to be included multiple times.
 */
 
-/* Analytical plain cut-off kernels */
+/* Analytical plain cut-off electrostatics kernels
+ */
 #define EL_CUTOFF
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_CUTOFF
+#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecCut_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME
 
-/* Analytical reaction-field kernels */
+#undef EL_CUTOFF
+
+/* Analytical reaction-field kernels
+ */
 #define EL_RF
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_RF
 #undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_RF
+
 /* Analytical Ewald interaction kernels
  */
 #define EL_EWALD_ANA
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_ANA
 #undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_ANA
+
+
 /* Analytical Ewald interaction kernels with twin-range cut-off
  */
 #define EL_EWALD_ANA
 #define VDW_CUTOFF_CHECK
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_ANA
-#undef VDW_CUTOFF_CHECK
+#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME
+#undef EL_EWALD_ANA
+#undef VDW_CUTOFF_CHECK
+
+
+
 /* Tabulated Ewald interaction kernels */
 #define EL_EWALD_TAB
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_TAB
+#undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
 #undef NB_KERNEL_FUNC_NAME
+#undef EL_EWALD_TAB
+
+
 /* Tabulated Ewald interaction kernels with twin-range cut-off */
 #define EL_EWALD_TAB
 #define VDW_CUTOFF_CHECK
+
+/* V shift */
 #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__
 #include "nbnxn_cuda_kernel.cuh"
-#undef EL_EWALD_TAB
-#undef VDW_CUTOFF_CHECK
 #undef NB_KERNEL_FUNC_NAME
+/* F switch */
+#define LJ_FORCE_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_FORCE_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+/* V switch */
+#define LJ_POT_SWITCH
+#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
+#include "nbnxn_cuda_kernel.cuh"
+#undef LJ_POT_SWITCH
+#undef NB_KERNEL_FUNC_NAME
+
+#undef EL_EWALD_TAB
+#undef VDW_CUTOFF_CHECK
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index f8d7a8d076..80d75cba99 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1991-2000, University of Groningen, The Netherlands.
  * Copyright (c) 2001-2012, The GROMACS development team.
- * Copyright (c) 2012,2013, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -35,9 +35,18 @@
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
+/*! \internal \file
+ * \brief
+ * Data types used internally in the nbnxn_cuda module.
+ *
+ * \author Szilárd Páll
+ * \ingroup module_mdlib
+ */
+
 #ifndef NBNXN_CUDA_TYPES_H
 #define NBNXN_CUDA_TYPES_H
 
+#include "types/interaction_const.h"
 #include "types/nbnxn_pairlist.h"
 #include "types/nbnxn_cuda_types_ext.h"
 #include "../../gmxlib/cuda_tools/cudautils.cuh"
@@ -46,7 +55,7 @@
 #if CUDA_VERSION >= 5000
 #define TEXOBJ_SUPPORTED
 #else  /* CUDA_VERSION */
-/* This typedef allows us to define only one version of struct cu_nbparam */
+/*! This typedef allows us to define only one version of struct cu_nbparam */
 typedef int cudaTextureObject_t;
 #endif /* CUDA_VERSION */
 
@@ -54,7 +63,9 @@ typedef int cudaTextureObject_t;
 extern "C" {
 #endif
 
-/** Types of electrostatics implementations available in the CUDA non-bonded
+/*! \brief Electrostatic CUDA kernel flavors.
+ *
+ * Types of electrostatics implementations available in the CUDA non-bonded
  * force kernels. These represent both the electrostatics types implemented
  * by the kernels (cut-off, RF, and Ewald - a subset of what's defined in
  * enums.h) as well as encode implementation details analytical/tabulated
 *
 * Note that the cut-off and RF kernels have only analytical flavor and unlike
 * in the CPU kernels, the tabulated kernels are ATM Ewald-only.
 *
- * The order of pointers to different electrostatic kernels defined in
- * nbnxn_cuda.cu by the nb_default_kfunc_ptr array
- * should match the order of enumerated types below. */
-enum {
+ * The row-order of pointers to different electrostatic kernels defined in
+ * nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer table
+ * should match the order of enumerated types below.
+ */
+enum eelCu {
     eelCuCUT, eelCuRF, eelCuEWALD_TAB, eelCuEWALD_TAB_TWIN, eelCuEWALD_ANA,
     eelCuEWALD_ANA_TWIN, eelCuNR
 };
 
+/*! \brief VdW CUDA kernel flavors.
+ *
+ * The enumerated values correspond to the LJ implementations in the CUDA non-bonded
+ * kernels.
+ *
+ * The column-order of pointers to the different VdW kernel flavors defined in
+ * nbnxn_cuda.cu by the nb_*_kfunc_ptr function pointer tables
+ * should match the order of enumerated types below. 
+ */ +enum evdwCu { + evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuNR +}; + /* All structs prefixed with "cu_" hold data used in GPU calculations and * are passed to the kernels, except cu_timers_t. */ +/*! \cond */ typedef struct cu_plist cu_plist_t; typedef struct cu_atomdata cu_atomdata_t; typedef struct cu_nbparam cu_nbparam_t; typedef struct cu_timers cu_timers_t; typedef struct nb_staging nb_staging_t; +/*! \endcond */ -/** Staging area for temporary data. The energies get downloaded here first, - * before getting added to the CPU-side aggregate values. +/** \internal + * \brief Staging area for temporary data downloaded from the GPU. + * + * The energies/shift forces get downloaded here first, before getting added + * to the CPU-side aggregate values. */ struct nb_staging { @@ -88,7 +118,9 @@ struct nb_staging float3 *fshift; /**< shift forces */ }; -/** Nonbonded atom data -- both inputs and outputs. */ +/** \internal + * \brief Nonbonded atom data - both inputs and outputs. + */ struct cu_atomdata { int natoms; /**< number of atoms */ @@ -97,9 +129,9 @@ struct cu_atomdata float4 *xq; /**< atom coordinates + charges, size natoms */ float3 *f; /**< force output array, size natoms */ - /* TODO: try float2 for the energies */ - float *e_lj, /**< LJ energy output, size 1 */ - *e_el; /**< Electrostatics energy input, size 1 */ + + float *e_lj; /**< LJ energy output, size 1 */ + float *e_el; /**< Electrostatics energy input, size 1 */ float3 *fshift; /**< shift forces */ @@ -110,20 +142,29 @@ struct cu_atomdata bool bShiftVecUploaded; /**< true if the shift vector has been uploaded */ }; -/** Parameters required for the CUDA nonbonded calculations. */ +/** \internal + * \brief Parameters required for the CUDA nonbonded calculations. + */ struct cu_nbparam { - int eeltype; /**< type of electrostatics */ - - float epsfac; /**< charge multiplication factor */ - float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */ - float two_k_rf; /**< Reaction-field electrostatics constant */ - float ewald_beta; /**< Ewald/PME parameter */ - float sh_ewald; /**< Ewald/PME correction term */ - float rvdw_sq; /**< VdW cut-off */ - float rcoulomb_sq; /**< Coulomb cut-off */ - float rlist_sq; /**< pair-list cut-off */ - float sh_invrc6; /**< LJ potential correction term */ + + int eeltype; /**< type of electrostatics, takes values from #eelCu */ + int vdwtype; /**< type of VdW impl., takes values from #evdwCu */ + + float epsfac; /**< charge multiplication factor */ + float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */ + float two_k_rf; /**< Reaction-field electrostatics constant */ + float ewald_beta; /**< Ewald/PME parameter */ + float sh_ewald; /**< Ewald/PME correction term */ + float rcoulomb_sq; /**< Coulomb cut-off squared */ + + float rvdw_sq; /**< VdW cut-off squared */ + float rvdw_switch; /**< VdW switched cut-off */ + float rlist_sq; /**< pair-list cut-off squared */ + + shift_consts_t dispersion_shift; /**< VdW shift dispersion constants */ + shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */ + switch_consts_t vdw_switch; /**< VdW switch constants */ /* Non-bonded parameters - accessed through texture memory */ float *nbfp; /**< nonbonded parameter table with C6/C12 pairs */ @@ -136,7 +177,9 @@ struct cu_nbparam cudaTextureObject_t coulomb_tab_texobj; /**< texture object bound to coulomb_tab */ }; -/** Pair list data */ +/** \internal + * \brief Pair list data. 
+ */ struct cu_plist { int na_c; /**< number of atoms per cluster */ @@ -157,7 +200,9 @@ struct cu_plist done during the current step */ }; -/** CUDA events used for timing GPU kernels and H2D/D2H transfers. +/** \internal + * \brief CUDA events used for timing GPU kernels and H2D/D2H transfers. + * * The two-sized arrays hold the local and non-local values and should always * be indexed with eintLocal/eintNonlocal. */ @@ -175,7 +220,9 @@ struct cu_timers cudaEvent_t stop_nb_k[2]; /**< stop event non-bonded kernels (l/nl, every step) */ }; -/** Main data structure for CUDA nonbonded force calculations. */ +/** \internal + * \brief Main data structure for CUDA nonbonded force calculations. + */ struct nbnxn_cuda { cuda_dev_info_t *dev_info; /**< CUDA device information */ -- 2.22.0
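The launch path in nbnxn_cuda.cu picks a kernel flavor from the 2-dim tables by electrostatics type, VdW type, and the energy/pruning flags. A minimal host-side C sketch of that table-plus-flags lookup is shown below; the enum values, the dummy kernel functions, and pick_kernel() are illustrative stand-ins, not names from the patch.

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

typedef void (*kernel_t)(void);

/* Dummy stand-ins for the generated CUDA kernels (illustrative only). */
static void cut_lj(void)  { puts("ElecCut_VdwLJ");    }
static void cut_fsw(void) { puts("ElecCut_VdwLJFsw"); }
static void cut_psw(void) { puts("ElecCut_VdwLJPsw"); }
static void rf_lj(void)   { puts("ElecRF_VdwLJ");     }
static void rf_fsw(void)  { puts("ElecRF_VdwLJFsw");  }
static void rf_psw(void)  { puts("ElecRF_VdwLJPsw");  }

enum { eelCut, eelRF, eelNR };                       /* rows:    electrostatics */
enum { evdwCut, evdwFSwitch, evdwPSwitch, evdwNR };  /* columns: VdW treatment  */

/* One 2-dim table per (energy, prune) combination, as in nbnxn_cuda.cu;
 * only the force-only/no-prune table is populated in this sketch. */
static const kernel_t noener_noprune[eelNR][evdwNR] = {
    { cut_lj, cut_fsw, cut_psw },
    { rf_lj,  rf_fsw,  rf_psw  },
};

static kernel_t pick_kernel(int eeltype, int evdwtype, bool bDoEne, bool bDoPrune)
{
    assert(eeltype < eelNR && evdwtype < evdwNR);
    (void)bDoEne; (void)bDoPrune; /* a full version would select among four tables */
    return noener_noprune[eeltype][evdwtype];
}

int main(void)
{
    pick_kernel(eelRF, evdwPSwitch, false, false)(); /* prints ElecRF_VdwLJPsw */
    return 0;
}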
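For checking the force-switch math outside a GPU build, the correction applied by calculate_force_switch_F()/calculate_force_switch_F_E() can be mirrored in plain C. The sketch below is such a mirror and not part of the patch: fswitch_ref() and its argument names are made up, and the c2/c3 shift constants are passed in directly (in the kernels they arrive in cu_nbparam_t via interaction_const_t).

#include <stdio.h>

/* Host-side mirror of the force-switch correction (illustrative only).
 * With x = r - rvdw_switch clamped at 0, each r^-p term contributes
 * (c2 + c3*x)*x^2/r to F/r (scaled by -c6 or +c12, as in the kernel) and
 * (c2/3 + c3/4*x)*x^3 to the energy with the opposite sign, i.e. minus the
 * integral of the force term, so F = -dV/dr still holds. */
static void fswitch_ref(float c6, float c12,
                        float disp_c2, float disp_c3,
                        float repu_c2, float repu_c3,
                        float rvdw_switch, float r,
                        float *F_invr, float *E_lj)
{
    float inv_r    = 1.0f/r;
    float r_switch = r - rvdw_switch;

    r_switch = (r_switch >= 0.0f) ? r_switch : 0.0f;

    *F_invr +=
        -c6 *(disp_c2 + disp_c3*r_switch)*r_switch*r_switch*inv_r +
         c12*(repu_c2 + repu_c3*r_switch)*r_switch*r_switch*inv_r;
    *E_lj   +=
         c6 *(disp_c2/3 + disp_c3/4*r_switch)*r_switch*r_switch*r_switch -
         c12*(repu_c2/3 + repu_c3/4*r_switch)*r_switch*r_switch*r_switch;
}

int main(void)
{
    /* made-up inputs, only to show the call; real c2/c3 come from the CPU setup */
    float F_invr = 0.0f, E_lj = 0.0f;

    fswitch_ref(1.0f, 1.0f, 5.0f, -20.0f, 5.0f, -20.0f, 0.9f, 0.95f, &F_invr, &E_lj);
    printf("dF/r = %g  dV = %g\n", F_invr, E_lj);
    return 0;
}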
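The potential-switch kernels scale both the LJ energy and force by the quintic polynomial evaluated in calculate_potential_switch_F()/calculate_potential_switch_F_E(). The standalone C check below prints that polynomial across the switching region; it assumes the standard quintic coefficients c3 = -10/d^3, c4 = 15/d^4, c5 = -6/d^5 (with d = rvdw - rvdw_switch), which is how the CPU side is expected to fill switch_consts_t. If the actual constants differ, only those three assignments need changing; the sw/dsw expressions are the same as in the kernel.

#include <stdio.h>

int main(void)
{
    const float rsw = 0.9f, rc = 1.0f;   /* rvdw_switch and rvdw, in nm */
    const float d   = rc - rsw;

    /* assumed quintic switch constants, S(x) = 1 + c3*x^3 + c4*x^4 + c5*x^5 */
    const float c3 = -10.0f/(d*d*d);
    const float c4 =  15.0f/(d*d*d*d);
    const float c5 =  -6.0f/(d*d*d*d*d);

    for (float r = rsw; r <= rc + 1e-6f; r += 0.025f)
    {
        float x   = r - rsw;
        /* same expressions as in the kernels */
        float sw  = 1.0f + (c3 + (c4 + c5*x)*x)*x*x*x;
        float dsw = (3*c3 + (4*c4 + 5*c5*x)*x)*x*x;

        printf("r=%.3f  S=%8.5f  dS/dr=%8.3f\n", r, sw, dsw);
    }
    /* S drops smoothly from 1 at rvdw_switch to 0 at rvdw, with dS/dr = 0 at both
     * ends, which is what keeps the switched potential and force continuous. */
    return 0;
}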