Simplify LJ parameter lookup

author Szilárd Páll <pall.szilard@gmail.com>

Thu, 4 Mar 2021 08:08:51 +0000 (08:08 +0000)

committer Mark Abraham <mark.j.abraham@gmail.com>

Thu, 4 Mar 2021 08:08:51 +0000 (08:08 +0000)
author Szilárd Páll <pall.szilard@gmail.com>
Thu, 4 Mar 2021 08:08:51 +0000 (08:08 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Thu, 4 Mar 2021 08:08:51 +0000 (08:08 +0000)
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu

index f4467c0248a0149f98dc38b33e6e3fdaaa056d9c..692f4076a6faa58b06802b44c884812012d6e146 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -151,15 +151,26 @@ static void init_nbparam(NBParamGpu*                     nbp,
      /* set up LJ parameter lookup table */
      if (!useLjCombRule(nbp->vdwType))
      {
-        initParamLookupTable(
-                &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * ntypes * ntypes, deviceContext);
+        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp,
+                             &nbp->nbfp_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+                             ntypes * ntypes,
+                             deviceContext);
      }
  
      /* set up LJ-PME parameter lookup table */
      if (ic->vdwtype == VanDerWaalsType::Pme)
      {
-        initParamLookupTable(
-                &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * ntypes, deviceContext);
+        static_assert(sizeof(decltype(nbp->nbfp_comb))
+                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp_comb,
+                             &nbp->nbfp_comb_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+                             ntypes,
+                             deviceContext);
      }
  }
  
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh

index 344e971c845afeb7e5a25abb65e7af8e3dcf8b32..6f90d0069425dc0fffb5fbe8c3a2f88083c4069a 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
@@ -336,15 +336,9 @@ __launch_bounds__(THREADS_PER_BLOCK)
  #            endif
  
  #            ifdef LJ_EWALD
-#                if DISABLE_CUDA_TEXTURES
-            E_lj += LDG(&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
-                                      * (ntypes + 1) * 2]);
-#                else
-            E_lj += tex1Dfetch<float>(
-                    nbparam.nbfp_texobj,
-                    atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
-                            * (ntypes + 1) * 2);
-#                endif
+            // load only the first 4 bytes of the parameter pair (equivalent with nbfp[idx].x)
+            E_lj += LDG((float*)&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+                                              * (ntypes + 1)]);
  #            endif
          }
  
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh

index 4850298f8f154082b8b19c2a395aab938fe8d59c..b916997a22c6097663814395762edb474e384e65 100644 (file)
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh
@@ -2,7 +2,7 @@
   * This file is part of the GROMACS molecular simulation package.
   *
   * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -204,11 +204,13 @@ static __forceinline__ __device__ void
  static __forceinline__ __device__ float calculate_lj_ewald_c6grid(const NBParamGpu nbparam, int typei, int typej)
  {
  #    if DISABLE_CUDA_TEXTURES
-    return LDG(&nbparam.nbfp_comb[2 * typei]) * LDG(&nbparam.nbfp_comb[2 * typej]);
+    float c6_i = LDG(&nbparam.nbfp_comb[typei]).x;
+    float c6_j = LDG(&nbparam.nbfp_comb[typej]).x;
  #    else
-    return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typei)
-           * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typej);
+    float c6_i = tex1Dfetch<float2>(nbparam.nbfp_comb_texobj, typei).x;
+    float c6_j = tex1Dfetch<float2>(nbparam.nbfp_comb_texobj, typej).x;
  #    endif /* DISABLE_CUDA_TEXTURES */
+    return c6_i * c6_j;
  }
  
  
@@ -278,19 +280,11 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const NB
   */
  static __forceinline__ __device__ float2 fetch_nbfp_comb_c6_c12(const NBParamGpu nbparam, int type)
  {
-    float2 c6c12;
  #    if DISABLE_CUDA_TEXTURES
-    /* Force an 8-byte fetch to save a memory instruction. */
-    float2* nbfp_comb = (float2*)nbparam.nbfp_comb;
-    c6c12             = LDG(&nbfp_comb[type]);
+    return LDG(&nbparam.nbfp_comb[type]);
  #    else
-    /* NOTE: as we always do 8-byte aligned loads, we could
-       fetch float2 here too just as above. */
-    c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type);
-    c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type + 1);
+    return tex1Dfetch<float2>(nbparam.nbfp_comb_texobj, type);
  #    endif /* DISABLE_CUDA_TEXTURES */
-
-    return c6c12;
  }
  
  
@@ -357,8 +351,8 @@ static __forceinline__ __device__ float2 fetch_coulomb_force_r(const NBParamGpu
      d.x = LDG(&nbparam.coulomb_tab[index]);
      d.y = LDG(&nbparam.coulomb_tab[index + 1]);
  #    else
-    d.x     = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
-    d.y     = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
+    d.x   = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
+    d.y   = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
  #    endif // DISABLE_CUDA_TEXTURES
  
      return d;
@@ -397,19 +391,14 @@ static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBPara
   */
  static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const NBParamGpu nbparam, int baseIndex)
  {
+    float2 c6c12;
  #    if DISABLE_CUDA_TEXTURES
-    /* Force an 8-byte fetch to save a memory instruction. */
-    float2* nbfp = (float2*)nbparam.nbfp;
-    float2  c6c12;
-    c6c12 = LDG(&nbfp[baseIndex]);
-    c6    = c6c12.x;
-    c12   = c6c12.y;
+    c6c12 = LDG(&nbparam.nbfp[baseIndex]);
  #    else
-    /* NOTE: as we always do 8-byte aligned loads, we could
-       fetch float2 here too just as above. */
-    c6  = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex);
-    c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex + 1);
+    c6c12 = tex1Dfetch<float2>(nbparam.nbfp_texobj, baseIndex);
  #    endif // DISABLE_CUDA_TEXTURES
+    c6  = c6c12.x;
+    c12 = c6c12.y;
  }
  
  
diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h

index 4b7f0e762e8f079dc878eecbbd3315b5ed1e0b0c..ee6357e3f637dbe5b6c4c524290bbd6501fe1f14 100644 (file)
--- a/src/gromacs/nbnxm/gpu_types_common.h
+++ b/src/gromacs/nbnxm/gpu_types_common.h
@@ -175,12 +175,12 @@ struct NBParamGpu
      switch_consts_t vdw_switch;
  
      /* LJ non-bonded parameters - accessed through texture memory */
-    //! nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements
-    DeviceBuffer<float> nbfp;
+    //! nonbonded parameter table with 6*C6/12*C12 pairs per atom type-pair, ntype^2 elements
+    DeviceBuffer<Float2> nbfp;
      //! texture object bound to nbfp
      DeviceTexture nbfp_texobj;
-    //! nonbonded parameter table per atom type, 2*ntype elements
-    DeviceBuffer<float> nbfp_comb;
+    //! nonbonded parameter table per atom type, ntype elements
+    DeviceBuffer<Float2> nbfp_comb;
      //! texture object bound to nbfp_comb
      DeviceTexture nbfp_comb_texobj;
  
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp

index a2ee20416f4684cfd6468e906a6380466c853a7b..0583d0a73d7d6896aa7ebacf2febbfeaea4b9717 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
@@ -163,19 +163,28 @@ static void init_nbparam(NBParamGpu*                     nbp,
          allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
      }
  
-    const int nnbfp      = 2 * nbatParams.numTypes * nbatParams.numTypes;
-    const int nnbfp_comb = 2 * nbatParams.numTypes;
-
      {
          /* set up LJ parameter lookup table */
-        DeviceBuffer<real> nbfp;
-        initParamLookupTable(&nbfp, nullptr, nbatParams.nbfp.data(), nnbfp, deviceContext);
+        static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+                      "Mismatch in the size of host / device data types");
+        DeviceBuffer<Float2> nbfp;
+        initParamLookupTable(&nbfp,
+                             nullptr,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+                             nbatParams.numTypes * nbatParams.numTypes,
+                             deviceContext);
          nbp->nbfp = nbfp;
  
          if (ic->vdwtype == VanDerWaalsType::Pme)
          {
-            DeviceBuffer<float> nbfp_comb;
-            initParamLookupTable(&nbfp_comb, nullptr, nbatParams.nbfp_comb.data(), nnbfp_comb, deviceContext);
+            static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+                          "Mismatch in the size of host / device data types");
+            DeviceBuffer<Float2> nbfp_comb;
+            initParamLookupTable(&nbfp_comb,
+                                 nullptr,
+                                 reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+                                 nbatParams.numTypes,
+                                 deviceContext);
              nbp->nbfp_comb = nbfp_comb;
          }
      }
@@ -482,8 +491,9 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
  
      if (useLjCombRule(nb->nbparam->vdwType))
      {
-        static_assert(sizeof(float) == sizeof(*nbat->params().lj_comb.data()),
-                      "Size of the LJ parameters element should be equal to the size of float2.");
+        static_assert(
+                sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
+                "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
          copyToDeviceBuffer(&d_atdat->ljComb,
                             reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
                             0,
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh

index 7a55ca8e2d53cb52b3f24763adff0940ee75682c..e51957eae6dd1b2e436fa3d77010431baee39570 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh
@@ -126,11 +126,11 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
  #else
                  const __global int* restrict atom_types, /* IN  */
  #endif
-                const __global float* restrict shift_vec,       /* IN stores float3 values */
-                __constant const float* gmx_unused nbfp,        /* IN  */
-                __constant const float* gmx_unused nbfp_comb,   /* IN  */
-                __constant const float* gmx_unused coulomb_tab, /* IN  */
-                const __global nbnxn_sci_t* pl_sci,             /* IN  */
+                const __global float* restrict shift_vec,          /* IN stores float3 values */
+                __constant const float2* restrict gmx_unused nbfp, /* IN  */
+                __constant const float2* restrict gmx_unused nbfp_comb,  /* IN  */
+                __constant const float* restrict gmx_unused coulomb_tab, /* IN  */
+                const __global nbnxn_sci_t* pl_sci,                      /* IN  */
  #ifndef PRUNE_NBL
                  const
  #endif
@@ -286,8 +286,8 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
              E_el += qi * qi;
  #        endif
  #        if defined LJ_EWALD
-            E_lj += nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi]
-                         * (ntypes + 1) * 2];
+            E_lj += nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi] * (ntypes + 1)]
+                            .x;
  #        endif /* LJ_EWALD */
          }
  
@@ -411,8 +411,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
  #endif                          /* IATYPE_SHMEM */
                                  /* LJ 6*C6 and 12*C12 */
  #ifndef LJ_COMB
-                                const float c6  = nbfp[2 * (ntypes * typei + typej)];
-                                const float c12 = nbfp[2 * (ntypes * typei + typej) + 1];
+                                const float2 c6c12 = nbfp[ntypes * typei + typej];
+
+                                const float c6  = c6c12.x;
+                                const float c12 = c6c12.y;
  #else /* LJ_COMB */
  #    ifdef LJ_COMB_GEOM
                                  const float c6     = ljcp_i.x * ljcp_j.x;
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh

index b3cb444857728648869d0ba7d3d8165b884be965..fa942547bad08a23d71d9c5a11e2fee093e5eb12 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh
@@ -456,19 +456,28 @@ gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t*
      *E_lj *= sw;
  }
  
+/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
+ */
+gmx_opencl_inline float calculate_lj_ewald_c6grid(__constant const float2* nbfp_comb, int typei, int typej)
+{
+    const float c6_i = nbfp_comb[typei].x;
+    const float c6_j = nbfp_comb[typej].x;
+    return c6_i * c6_j;
+}
+
  /*! Calculate LJ-PME grid force contribution with
   *  geometric combination rule.
   */
-gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nbfp_comb,
-                                                      int                     typei,
-                                                      int                     typej,
-                                                      float                   r2,
-                                                      float                   inv_r2,
-                                                      float                   lje_coeff2,
-                                                      float                   lje_coeff6_6,
-                                                      float*                  F_invr)
+gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float2* nbfp_comb,
+                                                      int                      typei,
+                                                      int                      typej,
+                                                      float                    r2,
+                                                      float                    inv_r2,
+                                                      float                    lje_coeff2,
+                                                      float                    lje_coeff6_6,
+                                                      float*                   F_invr)
  {
-    const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej];
+    const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);
  
      /* Recalculate inv_r6 without exclusion mask */
      const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
@@ -483,7 +492,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nb
  /*! Calculate LJ-PME grid force + energy contribution with
   *  geometric combination rule.
   */
-gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float*    nbfp_comb,
+gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float2*   nbfp_comb,
                                                          const cl_nbparam_params_t* nbparam,
                                                          int                        typei,
                                                          int                        typej,
@@ -495,7 +504,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float*
                                                          float*                     F_invr,
                                                          float*                     E_lj)
  {
-    const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej];
+    const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);
  
      /* Recalculate inv_r6 without exclusion mask */
      const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
@@ -516,7 +525,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float*
   *  We use a single F+E kernel with conditional because the performance impact
   *  of this is pretty small and LB on the CPU is anyway very slow.
   */
-gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float*    nbfp_comb,
+gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float2*   nbfp_comb,
                                                        const cl_nbparam_params_t* nbparam,
                                                        int                        typei,
                                                        int                        typej,
@@ -530,9 +539,10 @@ gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float*
                                                        float*                     E_lj)
  {
      /* sigma and epsilon are scaled to give 6*C6 */
-    const float sigma = nbfp_comb[2 * typei] + nbfp_comb[2 * typej];
-
-    const float epsilon = nbfp_comb[2 * typei + 1] * nbfp_comb[2 * typej + 1];
+    const float2 c6c12_i = nbfp_comb[typei];
+    const float2 c6c12_j = nbfp_comb[typej];
+    const float  sigma   = c6c12_i.x + c6c12_j.x;
+    const float  epsilon = c6c12_i.y + c6c12_j.y;
  
      const float sigma2 = sigma * sigma;
      const float c6grid = epsilon * sigma2 * sigma2 * sigma2;
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp

index 0f67f04f0a0bf181ef5dc48c085bd53145d5dc7f..b7ad72a6a0f8f622bedac3d74c01d2648d8b804f 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
@@ -124,15 +124,26 @@ static void initNbparam(NBParamGpu*                     nbp,
      /* set up LJ parameter lookup table */
      if (!useLjCombRule(nbp->vdwType))
      {
-        initParamLookupTable(
-                &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * numTypes * numTypes, deviceContext);
+        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp,
+                             &nbp->nbfp_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+                             numTypes * numTypes,
+                             deviceContext);
      }
  
      /* set up LJ-PME parameter lookup table */
      if (ic.vdwtype == VanDerWaalsType::Pme)
      {
-        initParamLookupTable(
-                &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * numTypes, deviceContext);
+        static_assert(sizeof(decltype(nbp->nbfp_comb))
+                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp_comb,
+                             &nbp->nbfp_comb_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+                             numTypes,
+                             deviceContext);
      }
  }
  
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp

index a372afa77542b646d53bf6a2e413db20558a7f3b..504557d5630c8e14da7bed76015157edae476bb4 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -100,15 +100,14 @@ using cl::sycl::access::fence_space;
  using cl::sycl::access::mode;
  using cl::sycl::access::target;
  
-static inline void convertSigmaEpsilonToC6C12(const float                  sigma,
-                                              const float                  epsilon,
-                                              cl::sycl::private_ptr<float> c6,
-                                              cl::sycl::private_ptr<float> c12)
+static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float epsilon)
  {
      const float sigma2 = sigma * sigma;
      const float sigma6 = sigma2 * sigma2 * sigma2;
-    *c6                = epsilon * sigma6;
-    *c12               = (*c6) * sigma6;
+    const float c6     = epsilon * sigma6;
+    const float c12    = c6 * sigma6;
+
+    return Float2(c6, c12);
  }
  
  template<bool doCalcEnergies>
@@ -147,25 +146,23 @@ static inline void ljForceSwitch(const shift_consts_t         dispersionShift,
  
  //! \brief Fetch C6 grid contribution coefficients and return the product of these.
  template<enum VdwType vdwType>
-static inline float calculateLJEwaldC6Grid(const DeviceAccessor<float, mode::read> a_nbfpComb,
-                                           const int                               typeI,
-                                           const int                               typeJ)
+static inline float calculateLJEwaldC6Grid(const DeviceAccessor<Float2, mode::read> a_nbfpComb,
+                                           const int                                typeI,
+                                           const int                                typeJ)
  {
      if constexpr (vdwType == VdwType::EwaldGeom)
      {
-        return a_nbfpComb[2 * typeI] * a_nbfpComb[2 * typeJ];
+        return a_nbfpComb[typeI][0] * a_nbfpComb[typeJ][0];
      }
      else
      {
          static_assert(vdwType == VdwType::EwaldLB);
          /* sigma and epsilon are scaled to give 6*C6 */
-        const float c6_i  = a_nbfpComb[2 * typeI];
-        const float c12_i = a_nbfpComb[2 * typeI + 1];
-        const float c6_j  = a_nbfpComb[2 * typeJ];
-        const float c12_j = a_nbfpComb[2 * typeJ + 1];
+        const Float2 c6c12_i = a_nbfpComb[typeI];
+        const Float2 c6c12_j = a_nbfpComb[typeJ];
  
-        const float sigma   = c6_i + c6_j;
-        const float epsilon = c12_i * c12_j;
+        const float sigma   = c6c12_i[0] + c6c12_j[0];
+        const float epsilon = c6c12_i[1] * c6c12_j[1];
  
          const float sigma2 = sigma * sigma;
          return epsilon * sigma2 * sigma2 * sigma2;
@@ -174,17 +171,17 @@ static inline float calculateLJEwaldC6Grid(const DeviceAccessor<float, mode::rea
  
  //! Calculate LJ-PME grid force contribution with geometric or LB combination rule.
  template<bool doCalcEnergies, enum VdwType vdwType>
-static inline void ljEwaldComb(const DeviceAccessor<float, mode::read> a_nbfpComb,
-                               const float                             sh_lj_ewald,
-                               const int                               typeI,
-                               const int                               typeJ,
-                               const float                             r2,
-                               const float                             r2Inv,
-                               const float                             lje_coeff2,
-                               const float                             lje_coeff6_6,
-                               const float                             int_bit,
-                               cl::sycl::private_ptr<float>            fInvR,
-                               cl::sycl::private_ptr<float>            eLJ)
+static inline void ljEwaldComb(const DeviceAccessor<Float2, mode::read> a_nbfpComb,
+                               const float                              sh_lj_ewald,
+                               const int                                typeI,
+                               const int                                typeJ,
+                               const float                              r2,
+                               const float                              r2Inv,
+                               const float                              lje_coeff2,
+                               const float                              lje_coeff6_6,
+                               const float                              int_bit,
+                               cl::sycl::private_ptr<float>             fInvR,
+                               cl::sycl::private_ptr<float>             eLJ)
  {
      const float c6grid = calculateLJEwaldC6Grid<vdwType>(a_nbfpComb, typeI, typeJ);
  
@@ -429,8 +426,8 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                   DeviceAccessor<nbnxn_excl_t, mode::read>                    a_plistExcl,
                   OptionalAccessor<Float2, mode::read, ljComb<vdwType>>       a_ljComb,
                   OptionalAccessor<int, mode::read, !ljComb<vdwType>>         a_atomTypes,
-                 OptionalAccessor<float, mode::read, !ljComb<vdwType>>       a_nbfp,
-                 OptionalAccessor<float, mode::read, ljEwald<vdwType>>       a_nbfpComb,
+                 OptionalAccessor<Float2, mode::read, !ljComb<vdwType>>      a_nbfp,
+                 OptionalAccessor<Float2, mode::read, ljEwald<vdwType>>      a_nbfpComb,
                   OptionalAccessor<float, mode::read, elecEwaldTab<elecType>> a_coulombTab,
                   const int                                                   numTypes,
                   const float                                                 rCoulombSq,
@@ -609,7 +606,7 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                      {
                          energyVdw +=
                                  a_nbfp[a_atomTypes[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
-                                       * (numTypes + 1) * 2];
+                                       * (numTypes + 1)][0];
                      }
                  }
                  /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
@@ -712,23 +709,21 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                          {
                              const float qi = xqi[3];
                              int         atomTypeI; // Only needed if (!props.vdwComb)
-                            float       c6, c12, sigma, epsilon;
+                            float       sigma, epsilon;
+                            Float2      c6c12;
  
                              if constexpr (!props.vdwComb)
                              {
                                  /* LJ 6*C6 and 12*C12 */
-                                atomTypeI     = sm_atomTypeI[i][tidxi];
-                                const int idx = (numTypes * atomTypeI + atomTypeJ) * 2;
-                                c6            = a_nbfp[idx]; // TODO: Make a_nbfm into float2
-                                c12           = a_nbfp[idx + 1];
+                                atomTypeI = sm_atomTypeI[i][tidxi];
+                                c6c12     = a_nbfp[numTypes * atomTypeI + atomTypeJ];
                              }
                              else
                              {
                                  const Float2 ljCombI = sm_ljCombI[i][tidxi];
                                  if constexpr (props.vdwCombGeom)
                                  {
-                                    c6  = ljCombI[0] * ljCombJ[0];
-                                    c12 = ljCombI[1] * ljCombJ[1];
+                                    c6c12 = Float2(ljCombI[0] * ljCombJ[0], ljCombI[1] * ljCombJ[1]);
                                  }
                                  else
                                  {
@@ -738,11 +733,15 @@ auto nbnxmKernel(cl::sycl::handler&                                   cgh,
                                      epsilon = ljCombI[1] * ljCombJ[1];
                                      if constexpr (doCalcEnergies)
                                      {
-                                        convertSigmaEpsilonToC6C12(sigma, epsilon, &c6, &c12);
+                                        c6c12 = convertSigmaEpsilonToC6C12(sigma, epsilon);
                                      }
                                  } // props.vdwCombGeom
                              }     // !props.vdwComb
  
+                            // c6 and c12 are unused and garbage iff props.vdwCombLB && !doCalcEnergies
+                            const float c6  = c6c12[0];
+                            const float c12 = c6c12[1];
+
                              // Ensure distance do not become so small that r^-12 overflows
                              r2 = std::max(r2, c_nbnxnMinDistanceSquared);
  #if GMX_SYCL_HIPSYCL
author	Szilárd Páll <pall.szilard@gmail.com>
	Thu, 4 Mar 2021 08:08:51 +0000 (08:08 +0000)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 4 Mar 2021 08:08:51 +0000 (08:08 +0000)
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh		patch \| blob \| history
src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh		patch \| blob \| history
src/gromacs/nbnxm/gpu_types_common.h		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh		patch \| blob \| history
src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp		patch \| blob \| history