From: Szilard Pall
Date: Tue, 25 Feb 2014 20:39:07 +0000 (+0100)
Subject: Added CUDA LJ-PME nbnxn kernels
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=d72a25ed4b53c22c8cd0d653f1e8b457b96b3c60;p=alexxy%2Fgromacs.git

Added CUDA LJ-PME nbnxn kernels

This change implements CUDA non-bonded kernels for LJ-PME, introduced in
the Verlet scheme with 99029d. The CUDA kernels implement the geometric
as well as the Lorentz-Berthelot (LB) combination rules (unlike the CPU
SIMD kernels), mostly because, even though LJ-PME is very slow with LB,
it is still beneficial to let the user offload the non-bonded
interactions to a GPU and potentially bump up the cut-off to further
reduce the CPU PME load.

Note that, as we now have 120 kernels (6 electrostatics types x 5 VdW
flavors x 4 force/energy/pruning variants) compiled for up to four
different target architectures, the nbnxn_cuda module takes a very long
time to build and can become the bottleneck during compilation. We will
deal with this later.

Change-Id: I819b59a8948da0c8492eac6a43d4a7fb6dc98354
---

diff --git a/src/gromacs/mdlib/forcerec.c b/src/gromacs/mdlib/forcerec.c
index 3784522200..95fbb19bab 100644
--- a/src/gromacs/mdlib/forcerec.c
+++ b/src/gromacs/mdlib/forcerec.c
@@ -1542,17 +1542,7 @@ gmx_bool nbnxn_acceleration_supported(FILE *fplog,
 const t_inputrec *ir,
 gmx_bool bGPU)
 {
- /* TODO: remove these GPU specific restrictions by implementing CUDA kernels */
- if (bGPU)
- {
- if (ir->vdwtype == evdwPME)
- {
- md_print_warn(cr, fplog, "LJ-PME is not yet supported with GPUs, falling back to CPU only\n");
- return FALSE;
- }
- }
-
- if (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB)
+ if (!bGPU && (ir->vdwtype == evdwPME && ir->ljpme_combination_rule == eljpmeLB))
 {
 md_print_warn(cr, fplog, "LJ-PME with Lorentz-Berthelot is not supported with %s, falling back to %s\n",
 bGPU ? "GPUs" : "SIMD kernels",
diff --git a/src/gromacs/mdlib/nbnxn_atomdata.c b/src/gromacs/mdlib/nbnxn_atomdata.c
index ba0b9a5bbc..d780c21331 100644
--- a/src/gromacs/mdlib/nbnxn_atomdata.c
+++ b/src/gromacs/mdlib/nbnxn_atomdata.c
@@ -360,32 +360,39 @@ void copy_rvec_to_nbat_real(const int *a, int na, int na_round,
 }
 }
-/* Stores the LJ parameter data in a format convenient for the SIMD kernels */
-static void set_ljparam_simd_data(nbnxn_atomdata_t *nbat)
+/* Stores the LJ parameter data in a format convenient for different kernels */
+static void set_lj_parameter_data(nbnxn_atomdata_t *nbat, gmx_bool bSIMD)
 {
 int nt, i, j;
 real c6, c12;
 nt = nbat->ntype;
- /* nbfp_s4 stores two parameters using a stride of 4,
- * because this would suit x86 SIMD single-precision
- * quad-load intrinsics. There's a slight inefficiency in
- * allocating and initializing nbfp_s4 when it might not
- * be used, but introducing the conditional code is not
- * really worth it. */
- nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4));
- for (i = 0; i < nt; i++)
+ if (bSIMD)
 {
- for (j = 0; j < nt; j++)
+ /* nbfp_s4 stores two parameters using a stride of 4,
+ * because this would suit x86 SIMD single-precision
+ * quad-load intrinsics. There's a slight inefficiency in
+ * allocating and initializing nbfp_s4 when it might not
+ * be used, but introducing the conditional code is not
+ * really worth it.
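+ *
+ * As a sketch of the layout built below (names from this file): for
+ * atom-type pair (i,j), slots [(i*nt+j)*4 + 0..3] hold {c6, c12, 0, 0},
+ * so a single aligned quad-load fetches both parameters plus padding.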
*/ + nbat->alloc((void **)&nbat->nbfp_s4, nt*nt*4*sizeof(*nbat->nbfp_s4)); + for (i = 0; i < nt; i++) { - nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0]; - nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1]; - nbat->nbfp_s4[(i*nt+j)*4+2] = 0; - nbat->nbfp_s4[(i*nt+j)*4+3] = 0; + for (j = 0; j < nt; j++) + { + nbat->nbfp_s4[(i*nt+j)*4+0] = nbat->nbfp[(i*nt+j)*2+0]; + nbat->nbfp_s4[(i*nt+j)*4+1] = nbat->nbfp[(i*nt+j)*2+1]; + nbat->nbfp_s4[(i*nt+j)*4+2] = 0; + nbat->nbfp_s4[(i*nt+j)*4+3] = 0; + } } } + /* We use combination rule data for SIMD combination rule kernels + * and with LJ-PME kernels. We then only need parameters per atom type, + * not per pair of atom types. + */ switch (nbat->comb_rule) { case ljcrGEOM: @@ -393,7 +400,7 @@ static void set_ljparam_simd_data(nbnxn_atomdata_t *nbat) for (i = 0; i < nt; i++) { - /* Copy the diagonal from the nbfp matrix */ + /* Store the sqrt of the diagonal from the nbfp matrix */ nbat->nbfp_comb[i*2 ] = sqrt(nbat->nbfp[(i*nt+i)*2 ]); nbat->nbfp_comb[i*2+1] = sqrt(nbat->nbfp[(i*nt+i)*2+1]); } @@ -527,7 +534,7 @@ void nbnxn_atomdata_init(FILE *fp, int i, j, nth; real c6, c12, tol; char *ptr; - gmx_bool simple, bCombGeom, bCombLB; + gmx_bool simple, bCombGeom, bCombLB, bSIMD; if (alloc == NULL) { @@ -688,10 +695,10 @@ void nbnxn_atomdata_init(FILE *fp, gmx_incons("Unknown enbnxninitcombrule"); } - if (simple) - { - set_ljparam_simd_data(nbat); - } + bSIMD = (nb_kernel_type == nbnxnk4xN_SIMD_4xN || + nb_kernel_type == nbnxnk4xN_SIMD_2xNN); + + set_lj_parameter_data(nbat, bSIMD); nbat->natoms = 0; nbat->type = NULL; @@ -700,27 +707,25 @@ void nbnxn_atomdata_init(FILE *fp, { int pack_x; - switch (nb_kernel_type) + if (bSIMD) { - case nbnxnk4xN_SIMD_4xN: - case nbnxnk4xN_SIMD_2xNN: - pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE, - nbnxn_kernel_to_cj_size(nb_kernel_type)); - switch (pack_x) - { - case 4: - nbat->XFormat = nbatX4; - break; - case 8: - nbat->XFormat = nbatX8; - break; - default: - gmx_incons("Unsupported packing width"); - } - break; - default: - nbat->XFormat = nbatXYZ; - break; + pack_x = max(NBNXN_CPU_CLUSTER_I_SIZE, + nbnxn_kernel_to_cj_size(nb_kernel_type)); + switch (pack_x) + { + case 4: + nbat->XFormat = nbatX4; + break; + case 8: + nbat->XFormat = nbatX8; + break; + default: + gmx_incons("Unsupported packing width"); + } + } + else + { + nbat->XFormat = nbatXYZ; } nbat->FFormat = nbat->XFormat; diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index 7477ac7863..2821e54cbd 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -62,9 +62,12 @@ #define USE_TEXOBJ #endif -/*! Texture reference for nonbonded parameters; bound to cu_nbparam_t.nbfp*/ +/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */ texture nbfp_texref; +/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */ +texture nbfp_comb_texref; + /*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */ texture coulomb_tab_texref; @@ -154,45 +157,45 @@ static inline int calc_nb_kernel_nblock(int nwork_units, cuda_dev_info_t *dinfo) /*! Force-only kernel function pointers. 
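 * Tables are indexed as [electrostatics type][VdW type] using the eelCu
 * and evdwCu enum values from nbnxn_cuda_types.h; e.g. the analytical
 * Ewald row combined with the evdwCuEWALDGEOM column resolves to
 * nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda below.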
*/ static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_noprune_ptr[eelCuNR][evdwCuNR] = { - { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda } + { nbnxn_kernel_ElecCut_VdwLJ_F_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_F_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_F_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_cuda } }; /*! Force + energy kernel function pointers. 
*/ static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_noprune_ptr[eelCuNR][evdwCuNR] = { - { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda } + { nbnxn_kernel_ElecCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_VF_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_VF_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_cuda } }; /*! Force + pruning kernel function pointers. 
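 * The *_prune flavors additionally prune the pair list while computing
 * forces and are intended for steps on which the pair list has just been
 * (re)built; see the kernel-selection helper referenced below.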
*/ static const nbnxn_cu_kfunc_ptr_t nb_kfunc_noener_prune_ptr[eelCuNR][evdwCuNR] = { - { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda } + { nbnxn_kernel_ElecCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_cuda } }; /*! Force + energy + pruning kernel function pointers. 
*/ static const nbnxn_cu_kfunc_ptr_t nb_kfunc_ener_prune_ptr[eelCuNR][evdwCuNR] = { - { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda }, - { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda } + { nbnxn_kernel_ElecCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecRF_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecEw_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_cuda }, + { nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_cuda, nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_cuda } }; /*! Return a pointer to the kernel version to be executed at the current step. */ @@ -683,6 +686,12 @@ const struct texture &nbnxn_cuda_get_nbfp_tex return nbfp_texref; } +/*! Return the reference to the nbfp_comb texture. */ +const struct texture &nbnxn_cuda_get_nbfp_comb_texref() +{ + return nbfp_comb_texref; +} + /*! Return the reference to the coulomb_tab. 
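 * These getters exist because CUDA texture references have
 * translation-unit scope: nbnxn_cuda_data_mgmt.cu declares them extern
 * (see below) and uses them to bind/unbind the textures on devices
 * without texture-object support (CC < 3.0).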
*/ const struct texture &nbnxn_cuda_get_coulomb_tab_texref() { diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu index 0fe6e8e275..dfdafbb2fd 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_data_mgmt.cu @@ -72,6 +72,7 @@ static unsigned int gpu_min_ci_balanced_factor = 40; /* Functions from nbnxn_cuda.cu */ extern void nbnxn_cuda_set_cacheconfig(cuda_dev_info_t *devinfo); extern const struct texture &nbnxn_cuda_get_nbfp_texref(); +extern const struct texture &nbnxn_cuda_get_nbfp_comb_texref(); extern const struct texture &nbnxn_cuda_get_coulomb_tab_texref(); /* We should actually be using md_print_warn in md_logging.c, @@ -266,39 +267,62 @@ static void init_nbparam(cu_nbparam_t *nbp, const cuda_dev_info_t *dev_info) { cudaError_t stat; - int ntypes, nnbfp; + int ntypes, nnbfp, nnbfp_comb; ntypes = nbat->ntype; - nbp->ewald_beta = ic->ewaldcoeff_q; - nbp->sh_ewald = ic->sh_ewald; - nbp->epsfac = ic->epsfac; - nbp->two_k_rf = 2.0 * ic->k_rf; - nbp->c_rf = ic->c_rf; - nbp->rvdw_sq = ic->rvdw * ic->rvdw; - nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb; - nbp->rlist_sq = ic->rlist * ic->rlist; + nbp->ewald_beta = ic->ewaldcoeff_q; + nbp->sh_ewald = ic->sh_ewald; + nbp->epsfac = ic->epsfac; + nbp->two_k_rf = 2.0 * ic->k_rf; + nbp->c_rf = ic->c_rf; + nbp->rvdw_sq = ic->rvdw * ic->rvdw; + nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb; + nbp->rlist_sq = ic->rlist * ic->rlist; + + nbp->sh_lj_ewald = ic->sh_lj_ewald; + nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj; nbp->rvdw_switch = ic->rvdw_switch; nbp->dispersion_shift = ic->dispersion_shift; nbp->repulsion_shift = ic->repulsion_shift; nbp->vdw_switch = ic->vdw_switch; - switch (ic->vdw_modifier) + if (ic->vdwtype == evdwCUT) + { + switch (ic->vdw_modifier) + { + case eintmodNONE: + case eintmodPOTSHIFT: + nbp->vdwtype = evdwCuCUT; + break; + case eintmodFORCESWITCH: + nbp->vdwtype = evdwCuFSWITCH; + break; + case eintmodPOTSWITCH: + nbp->vdwtype = evdwCuPSWITCH; + break; + default: + gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!"); + break; + } + } + else if (ic->vdwtype == evdwPME) + { + if (ic->ljpme_comb_rule == ljcrGEOM) + { + assert(nbat->comb_rule == ljcrGEOM); + nbp->vdwtype = evdwCuEWALDGEOM; + } + else + { + assert(nbat->comb_rule == ljcrLB); + nbp->vdwtype = evdwCuEWALDLB; + } + } + else { - case eintmodNONE: - case eintmodPOTSHIFT: - nbp->vdwtype = evdwCuCUT; - break; - case eintmodFORCESWITCH: - nbp->vdwtype = evdwCuFSWITCH; - break; - case eintmodPOTSWITCH: - nbp->vdwtype = evdwCuPSWITCH; - break; - default: - gmx_incons("The requested VdW interaction modifier is not implemented in the CUDA GPU accelerated kernels!"); - break; + gmx_incons("The requested VdW type is not implemented in the CUDA GPU accelerated kernels!"); } if (ic->eeltype == eelCUT) @@ -327,16 +351,28 @@ static void init_nbparam(cu_nbparam_t *nbp, init_ewald_coulomb_force_table(nbp, dev_info); } - nnbfp = 2*ntypes*ntypes; + nnbfp = 2*ntypes*ntypes; + nnbfp_comb = 2*ntypes; + stat = cudaMalloc((void **)&nbp->nbfp, nnbfp*sizeof(*nbp->nbfp)); CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp"); cu_copy_H2D(nbp->nbfp, nbat->nbfp, nnbfp*sizeof(*nbp->nbfp)); + + if (ic->vdwtype == evdwPME) + { + stat = cudaMalloc((void **)&nbp->nbfp_comb, nnbfp_comb*sizeof(*nbp->nbfp_comb)); + CU_RET_ERR(stat, "cudaMalloc failed on nbp->nbfp_comb"); + cu_copy_H2D(nbp->nbfp_comb, nbat->nbfp_comb, 
nnbfp_comb*sizeof(*nbp->nbfp_comb)); + } + #ifdef TEXOBJ_SUPPORTED /* Only device CC >= 3.0 (Kepler and later) support texture objects */ if (dev_info->prop.major >= 3) { cudaResourceDesc rd; + cudaTextureDesc td; + memset(&rd, 0, sizeof(rd)); rd.resType = cudaResourceTypeLinear; rd.res.linear.devPtr = nbp->nbfp; @@ -344,11 +380,25 @@ static void init_nbparam(cu_nbparam_t *nbp, rd.res.linear.desc.x = 32; rd.res.linear.sizeInBytes = nnbfp*sizeof(*nbp->nbfp); - cudaTextureDesc td; memset(&td, 0, sizeof(td)); td.readMode = cudaReadModeElementType; stat = cudaCreateTextureObject(&nbp->nbfp_texobj, &rd, &td, NULL); CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_texobj failed"); + + if (ic->vdwtype == evdwPME) + { + memset(&rd, 0, sizeof(rd)); + rd.resType = cudaResourceTypeLinear; + rd.res.linear.devPtr = nbp->nbfp_comb; + rd.res.linear.desc.f = cudaChannelFormatKindFloat; + rd.res.linear.desc.x = 32; + rd.res.linear.sizeInBytes = nnbfp_comb*sizeof(*nbp->nbfp_comb); + + memset(&td, 0, sizeof(td)); + td.readMode = cudaReadModeElementType; + stat = cudaCreateTextureObject(&nbp->nbfp_comb_texobj, &rd, &td, NULL); + CU_RET_ERR(stat, "cudaCreateTextureObject on nbfp_comb_texobj failed"); + } } else #endif @@ -357,6 +407,13 @@ static void init_nbparam(cu_nbparam_t *nbp, stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_texref(), nbp->nbfp, &cd, nnbfp*sizeof(*nbp->nbfp)); CU_RET_ERR(stat, "cudaBindTexture on nbfp_texref failed"); + + if (ic->vdwtype == evdwPME) + { + stat = cudaBindTexture(NULL, &nbnxn_cuda_get_nbfp_comb_texref(), + nbp->nbfp_comb, &cd, nnbfp_comb*sizeof(*nbp->nbfp_comb)); + CU_RET_ERR(stat, "cudaBindTexture on nbfp_comb_texref failed"); + } } } @@ -942,6 +999,24 @@ void nbnxn_cuda_free(nbnxn_cuda_ptr_t cu_nb) } cu_free_buffered(nbparam->nbfp); + if (nbparam->vdwtype == evdwCuEWALDGEOM || nbparam->vdwtype == evdwCuEWALDLB) + { +#ifdef TEXOBJ_SUPPORTED + /* Only device CC >= 3.0 (Kepler and later) support texture objects */ + if (cu_nb->dev_info->prop.major >= 3) + { + stat = cudaDestroyTextureObject(nbparam->nbfp_comb_texobj); + CU_RET_ERR(stat, "cudaDestroyTextureObject on nbfp_comb_texobj failed"); + } + else +#endif + { + stat = cudaUnbindTexture(nbnxn_cuda_get_nbfp_comb_texref()); + CU_RET_ERR(stat, "cudaUnbindTexture on nbfp_comb_texref failed"); + } + cu_free_buffered(nbparam->nbfp_comb); + } + stat = cudaFree(atdat->shift_vec); CU_RET_ERR(stat, "cudaFree failed on atdat->shift_vec"); stat = cudaFree(atdat->fshift); diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh index 1ea1828873..2d813ec61a 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh @@ -51,6 +51,20 @@ #define EL_EWALD_ANY #endif +#if defined EL_EWALD_ANY || defined EL_RF || defined LJ_EWALD +/* Macro to control the calculation of exclusion forces in the kernel + * We do that with Ewald (elec/vdw) and RF. + * + * Note: convenience macro, needs to be undef-ed at the end of the file. + */ +#define EXCLUSION_FORCES +#endif + +#if defined LJ_EWALD_COMB_GEOM || defined LJ_EWALD_COMB_LB +/* Note: convenience macro, needs to be undef-ed at the end of the file. 
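+ *
+ * LJ_EWALD is set whenever either combination-rule flavor is being
+ * compiled, so shared code below can test a single macro instead of
+ * both LJ_EWALD_COMB_GEOM and LJ_EWALD_COMB_LB.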
*/ +#define LJ_EWALD +#endif + /* Kernel launch parameters: - #blocks = #pair lists, blockId = pair list Id @@ -97,6 +111,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) float rvdw_sq = nbparam.rvdw_sq; float vdw_in_range; #endif +#ifdef LJ_EWALD + float lje_coeff2, lje_coeff6_6; +#endif #ifdef EL_RF float two_k_rf = nbparam.two_k_rf; #endif @@ -192,29 +209,56 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) fci_buf[ci_offset] = make_float3(0.0f); } +#ifdef LJ_EWALD + /* TODO: we are trading registers with flops by keeping lje_coeff-s, try re-calculating it later */ + lje_coeff2 = nbparam.ewaldcoeff_lj*nbparam.ewaldcoeff_lj; + lje_coeff6_6 = lje_coeff2*lje_coeff2*lje_coeff2*ONE_SIXTH_F; +#endif /* LJ_EWALD */ + + #ifdef CALC_ENERGIES E_lj = 0.0f; E_el = 0.0f; -#if defined EL_EWALD_ANY || defined EL_RF +#if defined EXCLUSION_FORCES /* Ewald or RF */ if (nb_sci.shift == CENTRAL && pl_cj4[cij4_start].cj[0] == sci*NCL_PER_SUPERCL) { - /* we have the diagonal: add the charge self interaction energy term */ + /* we have the diagonal: add the charge and LJ self interaction energy term */ for (i = 0; i < NCL_PER_SUPERCL; i++) { +#if defined EL_EWALD_ANY || defined EL_RF qi = xqib[i * CL_SIZE + tidxi].w; E_el += qi*qi; +#endif + +#if defined LJ_EWALD +#ifdef USE_TEXOBJ + E_lj += tex1Dfetch(nbparam.nbfp_texobj, atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2); +#else + E_lj += tex1Dfetch(nbfp_texref, atom_types[(sci*NCL_PER_SUPERCL + i)*CL_SIZE + tidxi]*(ntypes + 1)*2); +#endif /* USE_TEXOBJ */ +#endif /* LJ_EWALD */ + } - /* divide the self term equally over the j-threads */ + + /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */ +#ifdef LJ_EWALD + E_lj /= CL_SIZE; + E_lj *= 0.5f*ONE_SIXTH_F*lje_coeff6_6; +#endif /* LJ_EWALD */ + +#if defined EL_EWALD_ANY || defined EL_RF E_el /= CL_SIZE; #ifdef EL_RF E_el *= -nbparam.epsfac*0.5f*c_rf; #else E_el *= -nbparam.epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */ #endif +#endif /* EL_EWALD_ANY || defined EL_RF */ } -#endif -#endif +#endif /* EXCLUSION_FORCES */ + +#endif /* CALC_ENERGIES */ /* skip central shifts when summing shift forces */ if (nb_sci.shift == CENTRAL) @@ -300,7 +344,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) int_bit = (wexcl & mask_ji) ? 
1.0f : 0.0f; /* cutoff & exclusion check */ -#if defined EL_EWALD_ANY || defined EL_RF +#ifdef EXCLUSION_FORCES if (r2 < rcoulomb_sq * (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi)) #else @@ -331,11 +375,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) inv_r = rsqrt(r2); inv_r2 = inv_r * inv_r; inv_r6 = inv_r2 * inv_r2 * inv_r2; -#if defined EL_EWALD_ANY || defined EL_RF +#if defined EXCLUSION_FORCES /* We could mask inv_r2, but with Ewald * masking both inv_r6 and F_invr is faster */ inv_r6 *= int_bit; -#endif +#endif /* EXCLUSION_FORCES */ F_invr = inv_r6 * (c12 * inv_r6 - c6) * inv_r2; #if defined CALC_ENERGIES || defined LJ_POT_SWITCH @@ -351,6 +395,25 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) #endif /* CALC_ENERGIES */ #endif /* LJ_FORCE_SWITCH */ + +#ifdef LJ_EWALD +#ifdef LJ_EWALD_COMB_GEOM +#ifdef CALC_ENERGIES + calculate_lj_ewald_comb_geom_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, int_bit, &F_invr, &E_lj_p); +#else + calculate_lj_ewald_comb_geom_F(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, &F_invr); +#endif /* CALC_ENERGIES */ +#elif defined LJ_EWALD_COMB_LB + calculate_lj_ewald_comb_LB_F_E(nbparam, typei, typej, r2, inv_r2, lje_coeff2, lje_coeff6_6, +#ifdef CALC_ENERGIES + int_bit, &F_invr, &E_lj_p +#else + 0, &F_invr, NULL +#endif /* CALC_ENERGIES */ + ); +#endif /* LJ_EWALD_COMB_GEOM */ +#endif /* LJ_EWALD */ + #ifdef VDW_CUTOFF_CHECK /* Separate VDW cut-off check to enable twin-range cut-offs * (rvdw < rcoulomb <= rlist) @@ -486,3 +549,5 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda) } #undef EL_EWALD_ANY +#undef EXCLUSION_FORCES +#undef LJ_EWALD diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh index 00b6d47115..7ea591db30 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh @@ -183,6 +183,123 @@ void calculate_potential_switch_F_E(const cu_nbparam_t nbparam, *E_lj *= sw; } +/*! Calculate LJ-PME grid force contribution with + * geometric combination rule. + */ +static inline __device__ +void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, + float lje_coeff6_6, + float *F_invr) +{ + float c6grid, inv_r6_nm, cr2, expmcr2, poly; + +#ifdef USE_TEXOBJ + c6grid = tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typej); +#else + c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej); +#endif /* USE_TEXOBJ */ + + /* Recalculate inv_r6 without exclusion mask */ + inv_r6_nm = inv_r2*inv_r2*inv_r2; + cr2 = lje_coeff2*r2; + expmcr2 = expf(-cr2); + poly = 1.0f + cr2 + 0.5f*cr2*cr2; + + /* Subtract the grid force from the total LJ force */ + *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2; +} + +/*! Calculate LJ-PME grid force + energy contribution with + * geometric combination rule. 
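+ *
+ * With x = ewaldcoeff_lj^2 * r^2 and P(x) = 1 + x + x^2/2, the grid
+ * correction applied below is
+ *   F(r)/r = c6grid*(r^-6 - exp(-x)*(P(x)*r^-6 + ewaldcoeff_lj^6/6))*r^-2
+ *   E(r)   = c6grid/6*(r^-6*(1 - exp(-x)*P(x)) + sh_lj_ewald),
+ * where the shift is masked by int_bit so it only applies to
+ * non-excluded pairs; the LB variant below differs only in deriving
+ * c6grid from per-type sigma/epsilon data.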
+ */ +static inline __device__ +void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, + float lje_coeff6_6, + float int_bit, + float *F_invr, + float *E_lj) +{ + float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask; + +#ifdef USE_TEXOBJ + c6grid = tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typej); +#else + c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej); +#endif /* USE_TEXOBJ */ + + /* Recalculate inv_r6 without exclusion mask */ + inv_r6_nm = inv_r2*inv_r2*inv_r2; + cr2 = lje_coeff2*r2; + expmcr2 = expf(-cr2); + poly = 1.0f + cr2 + 0.5f*cr2*cr2; + + /* Subtract the grid force from the total LJ force */ + *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2; + + /* Shift should be applied only to real LJ pairs */ + sh_mask = nbparam.sh_lj_ewald*int_bit; + *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask); +} + +/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with + * Lorentz-Berthelot combination rule. + * We use a single F+E kernel with conditional because the performance impact + * of this is pretty small and LB on the CPU is anyway very slow. + */ +static inline __device__ +void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, + float lje_coeff6_6, + float int_bit, + float *F_invr, + float *E_lj) +{ + float c6grid, inv_r6_nm, cr2, expmcr2, poly; + float sigma, sigma2, epsilon; + + /* sigma and epsilon are scaled to give 6*C6 */ +#ifdef USE_TEXOBJ + sigma = tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typei ) + tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typej ); + epsilon = tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch(nbparam.nbfp_comb_texobj, 2*typej + 1); +#else + sigma = tex1Dfetch(nbfp_comb_texref, 2*typei ) + tex1Dfetch(nbfp_comb_texref, 2*typej ); + epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1); +#endif /* USE_TEXOBJ */ + sigma2 = sigma*sigma; + c6grid = epsilon*sigma2*sigma2*sigma2; + + /* Recalculate inv_r6 without exclusion mask */ + inv_r6_nm = inv_r2*inv_r2*inv_r2; + cr2 = lje_coeff2*r2; + expmcr2 = expf(-cr2); + poly = 1.0f + cr2 + 0.5f*cr2*cr2; + + /* Subtract the grid force from the total LJ force */ + *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2; + + if (E_lj != NULL) + { + float sh_mask; + + /* Shift should be applied only to real LJ pairs */ + sh_mask = nbparam.sh_lj_ewald*int_bit; + *E_lj += ONE_SIXTH_F*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask); + } +} /*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture. * Original idea: from the OpenMM project diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh index 555ec57325..897b956a74 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernels.cuh @@ -36,7 +36,8 @@ /*! \internal \file * This header has the sole purpose of generating kernels for the combinations of * supported electrostatics types (cut-off, reaction-field, analytical and - * tabulated Ewald) and VDW types ( V shift, F switch, V swtich). + * tabulated Ewald) and VDW types (cut-off + V shift, LJ-Ewald with + * geometric or Lorentz-Berthelot combination rule, F switch, V switch). 
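+ *
+ * Each combination below is generated by #defining the matching EL_* and
+ * LJ_* macros and re-#including nbnxn_cuda_kernel.cuh, with
+ * NB_KERNEL_FUNC_NAME redefined each time to paste the flavor-specific
+ * kernel name.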
* * The Ewald kernels have twin-range cut-off versions with rcoul != rvdw which * require an extra distance check to enable PP-PME load balancing @@ -49,17 +50,29 @@ */ #define EL_CUTOFF -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecCut_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -68,21 +81,34 @@ #undef EL_CUTOFF + /* Analytical reaction-field kernels */ #define EL_RF -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecRF_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -96,17 +122,29 @@ */ #define EL_EWALD_ANA -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEw_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecEw_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -116,23 +154,34 @@ #undef EL_EWALD_ANA - /* Analytical Ewald interaction kernels with twin-range cut-off */ #define EL_EWALD_ANA #define LJ_CUTOFF_CHECK -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwTwinCut_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -143,21 +192,32 @@ #undef LJ_CUTOFF_CHECK - /* Tabulated Ewald interaction kernels */ #define EL_EWALD_TAB -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTab_VdwLJPsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" @@ -171,17 +231,29 @@ #define EL_EWALD_TAB #define LJ_CUTOFF_CHECK -/* V shift */ +/* cut-off + V shift LJ */ #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJ ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef NB_KERNEL_FUNC_NAME -/* F switch */ +/* LJ-Ewald w geometric combination rules */ +#define LJ_EWALD_COMB_GEOM +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombGeom ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_GEOM +#undef NB_KERNEL_FUNC_NAME +/* LJ-Ewald w LB combination rules */ +#define LJ_EWALD_COMB_LB +#define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJEwCombLB ## __VA_ARGS__ +#include "nbnxn_cuda_kernel.cuh" +#undef LJ_EWALD_COMB_LB +#undef NB_KERNEL_FUNC_NAME +/* F switch LJ */ #define LJ_FORCE_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) x ## _ElecEwQSTabTwinCut_VdwLJFsw ## __VA_ARGS__ #include "nbnxn_cuda_kernel.cuh" #undef LJ_FORCE_SWITCH #undef NB_KERNEL_FUNC_NAME -/* V switch */ +/* V switch LJ */ #define LJ_POT_SWITCH #define NB_KERNEL_FUNC_NAME(x, ...) 
x ## _ElecEwQSTabTwinCut_VdwLJPsw ## __VA_ARGS__
#include "nbnxn_cuda_kernel.cuh"
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
index 80d75cba99..9c7192c6b4 100644
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_types.h
@@ -91,7 +91,7 @@ enum eelCu {
 * should match the order of enumerated types below.
 */
 enum evdwCu {
- evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuNR
+ evdwCuCUT, evdwCuFSWITCH, evdwCuPSWITCH, evdwCuEWALDGEOM, evdwCuEWALDLB, evdwCuNR
 };
 /* All structs prefixed with "cu_" hold data used in GPU calculations and
@@ -155,7 +155,10 @@ struct cu_nbparam
 float c_rf; /**< Reaction-field/plain cutoff electrostatics const. */
 float two_k_rf; /**< Reaction-field electrostatics constant */
 float ewald_beta; /**< Ewald/PME parameter */
- float sh_ewald; /**< Ewald/PME correction term */
+ float sh_ewald; /**< Ewald/PME correction term subtracted from the direct-space potential */
+ float sh_lj_ewald; /**< LJ-Ewald/PME correction term added to the correction potential */
+ float ewaldcoeff_lj; /**< LJ-Ewald/PME coefficient */
+
 float rcoulomb_sq; /**< Coulomb cut-off squared */
 float rvdw_sq; /**< VdW cut-off squared */
@@ -166,9 +169,11 @@
 shift_consts_t repulsion_shift; /**< VdW shift repulsion constants */
 switch_consts_t vdw_switch; /**< VdW switch constants */
- /* Non-bonded parameters - accessed through texture memory */
- float *nbfp; /**< nonbonded parameter table with C6/C12 pairs */
- cudaTextureObject_t nbfp_texobj; /**< texture object bound to nbfp */
+ /* LJ non-bonded parameters - accessed through texture memory */
+ float *nbfp; /**< nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements */
+ cudaTextureObject_t nbfp_texobj; /**< texture object bound to nbfp */
+ float *nbfp_comb; /**< nonbonded parameter table per atom type, 2*ntype elements */
+ cudaTextureObject_t nbfp_comb_texobj; /**< texture object bound to nbfp_comb */
 /* Ewald Coulomb force table data - accessed through texture memory */
 int coulomb_tab_size; /**< table size (s.t. it fits in texture cache) */
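---

For orientation, a minimal sketch of how the four kernel tables in
nbnxn_cuda.cu are indexed. The helper name and the two flag parameters
are illustrative assumptions; the actual selection logic lives in the
kernel-selection routine referenced above ("Return a pointer to the
kernel version to be executed at the current step").

/* Sketch only: pick one of the 6*5*4 = 120 kernel flavors.
 * bCalcEner/bDoPrune are assumed flag names, not the real signature. */
static nbnxn_cu_kfunc_ptr_t select_nb_kernel_sketch(int      eeltype,   /* eelCu* electrostatics type */
                                                    int      evdwtype,  /* evdwCu* VdW type */
                                                    gmx_bool bCalcEner,
                                                    gmx_bool bDoPrune)
{
    if (bDoPrune)
    {
        return bCalcEner ? nb_kfunc_ener_prune_ptr[eeltype][evdwtype] :
               nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
    }
    else
    {
        return bCalcEner ? nb_kfunc_ener_noprune_ptr[eeltype][evdwtype] :
               nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
    }
}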