From: Szilárd Páll <pall.szilard@gmail.com>
Date: Thu, 4 Mar 2021 08:08:51 +0000 (+0000)
Subject:  Simplify LJ parameter lookup
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=d440c46ef06c3a4c1695e3ec37e7e108ab7a2817;p=alexxy%2Fgromacs.git

 Simplify LJ parameter lookup

  - Unify the CUDA per-pair scaled C6/C12 LJ parameter
    loading codepaths so both texture and LDG paths do
    8-byte float2 loads.
  - Unify the CUDA per-atom LJ parameter loading codepaths
    so both texture and LDG paths do 8-byte float2 loads.
  - Simplify the LJ self interaction energy term calculation
    by only using the non-texture loads path to simplify code.
    As this is outside the main loop, its impact is negligible.
  - Use float2 loads for the above in OpenCL as well.
---

diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index f4467c0248..692f4076a6 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -151,15 +151,26 @@ static void init_nbparam(NBParamGpu*                     nbp,
     /* set up LJ parameter lookup table */
     if (!useLjCombRule(nbp->vdwType))
     {
-        initParamLookupTable(
-                &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * ntypes * ntypes, deviceContext);
+        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp,
+                             &nbp->nbfp_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+                             ntypes * ntypes,
+                             deviceContext);
     }
 
     /* set up LJ-PME parameter lookup table */
     if (ic->vdwtype == VanDerWaalsType::Pme)
     {
-        initParamLookupTable(
-                &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * ntypes, deviceContext);
+        static_assert(sizeof(decltype(nbp->nbfp_comb))
+                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp_comb,
+                             &nbp->nbfp_comb_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+                             ntypes,
+                             deviceContext);
     }
 }
 
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
index 344e971c84..6f90d00694 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
@@ -336,15 +336,9 @@ __launch_bounds__(THREADS_PER_BLOCK)
 #            endif
 
 #            ifdef LJ_EWALD
-#                if DISABLE_CUDA_TEXTURES
-            E_lj += LDG(&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
-                                      * (ntypes + 1) * 2]);
-#                else
-            E_lj += tex1Dfetch<float>(
-                    nbparam.nbfp_texobj,
-                    atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
-                            * (ntypes + 1) * 2);
-#                endif
+            // load only the first 4 bytes of the parameter pair (equivalent with nbfp[idx].x)
+            E_lj += LDG((float*)&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+                                              * (ntypes + 1)]);
 #            endif
         }
 
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
index 4850298f8f..b916997a22 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -204,11 +204,13 @@ static __forceinline__ __device__ void
 static __forceinline__ __device__ float calculate_lj_ewald_c6grid(const NBParamGpu nbparam, int typei, int typej)
 {
 #    if DISABLE_CUDA_TEXTURES
-    return LDG(&nbparam.nbfp_comb[2 * typei]) * LDG(&nbparam.nbfp_comb[2 * typej]);
+    float c6_i = LDG(&nbparam.nbfp_comb[typei]).x;
+    float c6_j = LDG(&nbparam.nbfp_comb[typej]).x;
 #    else
-    return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typei)
-           * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typej);
+    float c6_i = tex1Dfetch<float2>(nbparam.nbfp_comb_texobj, typei).x;
+    float c6_j = tex1Dfetch<float2>(nbparam.nbfp_comb_texobj, typej).x;
 #    endif /* DISABLE_CUDA_TEXTURES */
+    return c6_i * c6_j;
 }
 
 
@@ -278,19 +280,11 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const NB
  */
 static __forceinline__ __device__ float2 fetch_nbfp_comb_c6_c12(const NBParamGpu nbparam, int type)
 {
-    float2 c6c12;
 #    if DISABLE_CUDA_TEXTURES
-    /* Force an 8-byte fetch to save a memory instruction. */
-    float2* nbfp_comb = (float2*)nbparam.nbfp_comb;
-    c6c12             = LDG(&nbfp_comb[type]);
+    return LDG(&nbparam.nbfp_comb[type]);
 #    else
-    /* NOTE: as we always do 8-byte aligned loads, we could
-       fetch float2 here too just as above. */
-    c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type);
-    c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type + 1);
+    return tex1Dfetch<float2>(nbparam.nbfp_comb_texobj, type);
 #    endif /* DISABLE_CUDA_TEXTURES */
-
-    return c6c12;
 }
 
 
@@ -357,8 +351,8 @@ static __forceinline__ __device__ float2 fetch_coulomb_force_r(const NBParamGpu
     d.x = LDG(&nbparam.coulomb_tab[index]);
     d.y = LDG(&nbparam.coulomb_tab[index + 1]);
 #    else
-    d.x     = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
-    d.y     = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
+    d.x   = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
+    d.y   = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
 #    endif // DISABLE_CUDA_TEXTURES
 
     return d;
@@ -397,19 +391,14 @@ static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBPara
  */
 static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const NBParamGpu nbparam, int baseIndex)
 {
+    float2 c6c12;
 #    if DISABLE_CUDA_TEXTURES
-    /* Force an 8-byte fetch to save a memory instruction. */
-    float2* nbfp = (float2*)nbparam.nbfp;
-    float2  c6c12;
-    c6c12 = LDG(&nbfp[baseIndex]);
-    c6    = c6c12.x;
-    c12   = c6c12.y;
+    c6c12 = LDG(&nbparam.nbfp[baseIndex]);
 #    else
-    /* NOTE: as we always do 8-byte aligned loads, we could
-       fetch float2 here too just as above. */
-    c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex);
-    c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex + 1);
+    c6c12 = tex1Dfetch<float2>(nbparam.nbfp_texobj, baseIndex);
 #    endif // DISABLE_CUDA_TEXTURES
+    c6  = c6c12.x;
+    c12 = c6c12.y;
 }
 
 
diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h
index 4b7f0e762e..ee6357e3f6 100644
--- a/src/gromacs/nbnxm/gpu_types_common.h
+++ b/src/gromacs/nbnxm/gpu_types_common.h
@@ -175,12 +175,12 @@ struct NBParamGpu
     switch_consts_t vdw_switch;
 
     /* LJ non-bonded parameters - accessed through texture memory */
-    //! nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements
-    DeviceBuffer<float> nbfp;
+    //! nonbonded parameter table with 6*C6/12*C12 pairs per atom type-pair, ntype^2 elements
+    DeviceBuffer<Float2> nbfp;
     //! texture object bound to nbfp
     DeviceTexture nbfp_texobj;
-    //! nonbonded parameter table per atom type, 2*ntype elements
-    DeviceBuffer<float> nbfp_comb;
+    //! nonbonded parameter table per atom type, ntype elements
+    DeviceBuffer<Float2> nbfp_comb;
     //! texture object bound to nbfp_comb
     DeviceTexture nbfp_comb_texobj;
 
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
index a2ee20416f..0583d0a73d 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -163,19 +163,28 @@ static void init_nbparam(NBParamGpu*                     nbp,
         allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
     }
 
-    const int nnbfp      = 2 * nbatParams.numTypes * nbatParams.numTypes;
-    const int nnbfp_comb = 2 * nbatParams.numTypes;
-
     {
         /* set up LJ parameter lookup table */
-        DeviceBuffer<real> nbfp;
-        initParamLookupTable(&nbfp, nullptr, nbatParams.nbfp.data(), nnbfp, deviceContext);
+        static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+                      "Mismatch in the size of host / device data types");
+        DeviceBuffer<Float2> nbfp;
+        initParamLookupTable(&nbfp,
+                             nullptr,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+                             nbatParams.numTypes * nbatParams.numTypes,
+                             deviceContext);
         nbp->nbfp = nbfp;
 
         if (ic->vdwtype == VanDerWaalsType::Pme)
         {
-            DeviceBuffer<float> nbfp_comb;
-            initParamLookupTable(&nbfp_comb, nullptr, nbatParams.nbfp_comb.data(), nnbfp_comb, deviceContext);
+            static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+                          "Mismatch in the size of host / device data types");
+            DeviceBuffer<Float2> nbfp_comb;
+            initParamLookupTable(&nbfp_comb,
+                                 nullptr,
+                                 reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+                                 nbatParams.numTypes,
+                                 deviceContext);
             nbp->nbfp_comb = nbfp_comb;
         }
     }
@@ -482,8 +491,9 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
     if (useLjCombRule(nb->nbparam->vdwType))
     {
-        static_assert(sizeof(float) == sizeof(*nbat->params().lj_comb.data()),
-                      "Size of the LJ parameters element should be equal to the size of float2.");
+        static_assert(
+                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
+                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
         copyToDeviceBuffer(&d_atdat->ljComb,
                            reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                            0,
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh
index 7a55ca8e2d..e51957eae6 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh
@@ -126,11 +126,11 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 #else
                 const __global int* restrict atom_types, /* IN  */
 #endif
-                const __global float* restrict shift_vec,       /* IN stores float3 values */
-                __constant const float* gmx_unused nbfp,        /* IN  */
-                __constant const float* gmx_unused nbfp_comb,   /* IN  */
-                __constant const float* gmx_unused coulomb_tab, /* IN  */
-                const __global nbnxn_sci_t* pl_sci,             /* IN  */
+                const __global float* restrict shift_vec,          /* IN stores float3 values */
+                __constant const float2* restrict gmx_unused nbfp, /* IN  */
+                __constant const float2* restrict gmx_unused nbfp_comb,  /* IN  */
+                __constant const float* restrict gmx_unused coulomb_tab, /* IN  */
+                const __global nbnxn_sci_t* pl_sci,                      /* IN  */
 #ifndef PRUNE_NBL
                 const
 #endif
@@ -286,8 +286,8 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
             E_el += qi * qi;
 #        endif
 #        if defined LJ_EWALD
-            E_lj += nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi]
-                         * (ntypes + 1) * 2];
+            E_lj += nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi] * (ntypes + 1)]
+                            .x;
 #        endif /* LJ_EWALD */
         }
 
@@ -411,8 +411,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 #endif                          /* IATYPE_SHMEM */
                                 /* LJ 6*C6 and 12*C12 */
 #ifndef LJ_COMB
-                                const float c6  = nbfp[2 * (ntypes * typei + typej)];
-                                const float c12 = nbfp[2 * (ntypes * typei + typej) + 1];
+                                const float2 c6c12 = nbfp[ntypes * typei + typej];
+
+                                const float c6  = c6c12.x;
+                                const float c12 = c6c12.y;
 #else /* LJ_COMB */
 #    ifdef LJ_COMB_GEOM
                                 const float c6     = ljcp_i.x * ljcp_j.x;
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
index b3cb444857..fa942547ba 100644
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
@@ -456,19 +456,28 @@ gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t*
     *E_lj *= sw;
 }
 
+/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
+ */
+gmx_opencl_inline float calculate_lj_ewald_c6grid(__constant const float2* nbfp_comb, int typei, int typej)
+{
+    const float c6_i = nbfp_comb[typei].x;
+    const float c6_j = nbfp_comb[typej].x;
+    return c6_i * c6_j;
+}
+
 /*! Calculate LJ-PME grid force contribution with
  *  geometric combination rule.
  */
-gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nbfp_comb,
-                                                      int                     typei,
-                                                      int                     typej,
-                                                      float                   r2,
-                                                      float                   inv_r2,
-                                                      float                   lje_coeff2,
-                                                      float                   lje_coeff6_6,
-                                                      float*                  F_invr)
+gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float2* nbfp_comb,
+                                                      int                      typei,
+                                                      int                      typej,
+                                                      float                    r2,
+                                                      float                    inv_r2,
+                                                      float                    lje_coeff2,
+                                                      float                    lje_coeff6_6,
+                                                      float*                   F_invr)
 {
-    const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej];
+    const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);
 
     /* Recalculate inv_r6 without exclusion mask */
     const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
@@ -483,7 +492,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nb
 /*! Calculate LJ-PME grid force + energy contribution with
  *  geometric combination rule.
  */
-gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float*    nbfp_comb,
+gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float2*   nbfp_comb,
                                                         const cl_nbparam_params_t* nbparam,
                                                         int                        typei,
                                                         int                        typej,
@@ -495,7 +504,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float*
                                                         float*                     F_invr,
                                                         float*                     E_lj)
 {
-    const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej];
+    const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);
 
     /* Recalculate inv_r6 without exclusion mask */
     const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
@@ -516,7 +525,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float*
  *  We use a single F+E kernel with conditional because the performance impact
  *  of this is pretty small and LB on the CPU is anyway very slow.
  */
-gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float*    nbfp_comb,
+gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float2*   nbfp_comb,
                                                       const cl_nbparam_params_t* nbparam,
                                                       int                        typei,
                                                       int                        typej,
@@ -530,9 +539,10 @@ gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float*
                                                       float*                     E_lj)
 {
     /* sigma and epsilon are scaled to give 6*C6 */
-    const float sigma = nbfp_comb[2 * typei] + nbfp_comb[2 * typej];
-
-    const float epsilon = nbfp_comb[2 * typei + 1] * nbfp_comb[2 * typej + 1];
+    const float2 c6c12_i = nbfp_comb[typei];
+    const float2 c6c12_j = nbfp_comb[typej];
+    const float  sigma   = c6c12_i.x + c6c12_j.x;
+    const float  epsilon = c6c12_i.y + c6c12_j.y;
 
     const float sigma2 = sigma * sigma;
     const float c6grid = epsilon * sigma2 * sigma2 * sigma2;
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
index 0f67f04f0a..b7ad72a6a0 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
@@ -124,15 +124,26 @@ static void initNbparam(NBParamGpu*                     nbp,
     /* set up LJ parameter lookup table */
     if (!useLjCombRule(nbp->vdwType))
     {
-        initParamLookupTable(
-                &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * numTypes * numTypes, deviceContext);
+        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp,
+                             &nbp->nbfp_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+                             numTypes * numTypes,
+                             deviceContext);
     }
 
     /* set up LJ-PME parameter lookup table */
     if (ic.vdwtype == VanDerWaalsType::Pme)
     {
-        initParamLookupTable(
-                &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * numTypes, deviceContext);
+        static_assert(sizeof(decltype(nbp->nbfp_comb))
+                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp_comb,
+                             &nbp->nbfp_comb_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+                             numTypes,
+                             deviceContext);
     }
 }
 
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
index a372afa775..504557d563 100644
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -100,15 +100,14 @@ using cl::sycl::access::fence_space;
 using cl::sycl::access::mode;
 using cl::sycl::access::target;
 
-static inline void convertSigmaEpsilonToC6C12(const float                  sigma,
-                                              const float                  epsilon,
-                                              cl::sycl::private_ptr<float> c6,
-                                              cl::sycl::private_ptr<float> c12)
+static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float epsilon)
 {
     const float sigma2 = sigma * sigma;
     const float sigma6 = sigma2 * sigma2 * sigma2;
-    *c6                = epsilon * sigma6;
-    *c12               = (*c6) * sigma6;
+    const float c6     = epsilon * sigma6;
+    const float c12    = c6 * sigma6;
+
+    return Float2(c6, c12);
 }
 
 template<bool doCalcEnergies>
@@ -147,25 +146,23 @@ static inline void ljForceSwitch(const shift_consts_t         dispersionShift,
 
 //! \brief Fetch C6 grid contribution coefficients and return the product of these.
 template<enum VdwType vdwType>
-static inline float calculateLJEwaldC6Grid(const DeviceAccessor<float, mode::read> a_nbfpComb,
-                                           const int                               typeI,
-                                           const int                               typeJ)
+static inline float calculateLJEwaldC6Grid(const DeviceAccessor<Float2, mode::read> a_nbfpComb,
+                                           const int                                typeI,
+                                           const int                                typeJ)
 {
     if constexpr (vdwType == VdwType::EwaldGeom)
     {
-        return a_nbfpComb[2 * typeI] * a_nbfpComb[2 * typeJ];
+        return a_nbfpComb[typeI][0] * a_nbfpComb[typeJ][0];
     }
     else
     {
         static_assert(vdwType == VdwType::EwaldLB);
         /* sigma and epsilon are scaled to give 6*C6 */
-        const float c6_i  = a_nbfpComb[2 * typeI];
-        const float c12_i = a_nbfpComb[2 * typeI + 1];
-        const float c6_j  = a_nbfpComb[2 * typeJ];
-        const float c12_j = a_nbfpComb[2 * typeJ + 1];
+        const Float2 c6c12_i = a_nbfpComb[typeI];
+        const Float2 c6c12_j = a_nbfpComb[typeJ];
 
-        const float sigma   = c6_i + c6_j;
-        const float epsilon = c12_i * c12_j;
+        const float sigma   = c6c12_i[0] + c6c12_j[0];
+        const float epsilon = c6c12_i[1] * c6c12_j[1];
 
         const float sigma2 = sigma * sigma;
         return epsilon * sigma2 * sigma2 * sigma2;
@@ -174,17 +171,17 @@ static inline float calculateLJEwaldC6Grid(const DeviceAccessor<float, mode::rea
 
 //! Calculate LJ-PME grid force contribution with geometric or LB combination rule.
 template<bool doCalcEnergies, enum VdwType vdwType>
-static inline void ljEwaldComb(const DeviceAccessor<float, mode::read> a_nbfpComb,
-                               const float                             sh_lj_ewald,
-                               const int                               typeI,
-                               const int                               typeJ,
-                               const float                             r2,
-                               const float                             r2Inv,
-                               const float                             lje_coeff2,
-                               const float                             lje_coeff6_6,
-                               const float                             int_bit,
-                               cl::sycl::private_ptr<float>            fInvR,
-                               cl::sycl::private_ptr<float>            eLJ)
+static inline void ljEwaldComb(const DeviceAccessor<Float2, mode::read> a_nbfpComb,
+                               const float                              sh_lj_ewald,
+                               const int                                typeI,
+                               const int                                typeJ,
+                               const float                              r2,
+                               const float                              r2Inv,
+                               const float                              lje_coeff2,
+                               const float                              lje_coeff6_6,
+                               const float                              int_bit,
+                               cl::sycl::private_ptr<float>             fInvR,
+                               cl::sycl::private_ptr<float>             eLJ)
 {
     const float c6grid = calculateLJEwaldC6Grid<vdwType>(a_nbfpComb, typeI, typeJ);
 
@@ -429,8 +426,8 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                  DeviceAccessor<nbnxn_excl_t, mode::read>                    a_plistExcl,
                  OptionalAccessor<Float2, mode::read, ljComb<vdwType>>       a_ljComb,
                  OptionalAccessor<int, mode::read, !ljComb<vdwType>>         a_atomTypes,
-                 OptionalAccessor<float, mode::read, !ljComb<vdwType>>       a_nbfp,
-                 OptionalAccessor<float, mode::read, ljEwald<vdwType>>       a_nbfpComb,
+                 OptionalAccessor<Float2, mode::read, !ljComb<vdwType>>      a_nbfp,
+                 OptionalAccessor<Float2, mode::read, ljEwald<vdwType>>      a_nbfpComb,
                  OptionalAccessor<float, mode::read, elecEwaldTab<elecType>> a_coulombTab,
                  const int                                                   numTypes,
                  const float                                                 rCoulombSq,
@@ -609,7 +606,7 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                     {
                         energyVdw +=
                                 a_nbfp[a_atomTypes[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
-                                       * (numTypes + 1) * 2];
+                                       * (numTypes + 1)][0];
                     }
                 }
                 /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
@@ -712,23 +709,21 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                         {
                             const float qi = xqi[3];
                             int         atomTypeI; // Only needed if (!props.vdwComb)
-                            float       c6, c12, sigma, epsilon;
+                            float       sigma, epsilon;
+                            Float2      c6c12;
 
                             if constexpr (!props.vdwComb)
                             {
                                 /* LJ 6*C6 and 12*C12 */
-                                atomTypeI     = sm_atomTypeI[i][tidxi];
-                                const int idx = (numTypes * atomTypeI + atomTypeJ) * 2;
-                                c6            = a_nbfp[idx]; // TODO: Make a_nbfm into float2
-                                c12           = a_nbfp[idx + 1];
+                                atomTypeI = sm_atomTypeI[i][tidxi];
+                                c6c12     = a_nbfp[numTypes * atomTypeI + atomTypeJ];
                             }
                             else
                             {
                                 const Float2 ljCombI = sm_ljCombI[i][tidxi];
                                 if constexpr (props.vdwCombGeom)
                                 {
-                                    c6  = ljCombI[0] * ljCombJ[0];
-                                    c12 = ljCombI[1] * ljCombJ[1];
+                                    c6c12 = Float2(ljCombI[0] * ljCombJ[0], ljCombI[1] * ljCombJ[1]);
                                 }
                                 else
                                 {
@@ -738,11 +733,15 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                                     epsilon = ljCombI[1] * ljCombJ[1];
                                     if constexpr (doCalcEnergies)
                                     {
-                                        convertSigmaEpsilonToC6C12(sigma, epsilon, &c6, &c12);
+                                        c6c12 = convertSigmaEpsilonToC6C12(sigma, epsilon);
                                     }
                                 } // props.vdwCombGeom
                             }     // !props.vdwComb
 
+                            // c6 and c12 are unused and garbage iff props.vdwCombLB && !doCalcEnergies
+                            const float c6  = c6c12[0];
+                            const float c12 = c6c12[1];
+
                             // Ensure distance do not become so small that r^-12 overflows
                             r2 = std::max(r2, c_nbnxnMinDistanceSquared);
 #if GMX_SYCL_HIPSYCL