From: Szilárd Páll Date: Thu, 4 Mar 2021 08:08:51 +0000 (+0000) Subject: Simplify LJ parameter lookup X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=d440c46ef06c3a4c1695e3ec37e7e108ab7a2817;p=alexxy%2Fgromacs.git Simplify LJ parameter lookup - Unify the CUDA per-pair scaled C6/C12 LJ parameter loading codepaths so both texture and LDG paths do 8-byte float2 loads. - Unify the CUDA per-atom LJ parameter loading codepaths so both texture and LDG paths do 8-byte float2 loads. - Simplify the LJ self interaction energy term calculation by only using the non-texture loads path to simplify code. As this is outside the main loop, its impact is negligible. - Use float2 loads for the above in OpenCL as well. --- diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu index f4467c0248..692f4076a6 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu @@ -151,15 +151,26 @@ static void init_nbparam(NBParamGpu* nbp, /* set up LJ parameter lookup table */ if (!useLjCombRule(nbp->vdwType)) { - initParamLookupTable( - &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * ntypes * ntypes, deviceContext); + static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())), + "Mismatch in the size of host / device data types"); + initParamLookupTable(&nbp->nbfp, + &nbp->nbfp_texobj, + reinterpret_cast(nbatParams.nbfp.data()), + ntypes * ntypes, + deviceContext); } /* set up LJ-PME parameter lookup table */ if (ic->vdwtype == VanDerWaalsType::Pme) { - initParamLookupTable( - &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * ntypes, deviceContext); + static_assert(sizeof(decltype(nbp->nbfp_comb)) + == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())), + "Mismatch in the size of host / device data types"); + initParamLookupTable(&nbp->nbfp_comb, + &nbp->nbfp_comb_texobj, + reinterpret_cast(nbatParams.nbfp_comb.data()), + ntypes, + deviceContext); } } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh index 344e971c84..6f90d00694 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh @@ -336,15 +336,9 @@ __launch_bounds__(THREADS_PER_BLOCK) # endif # ifdef LJ_EWALD -# if DISABLE_CUDA_TEXTURES - E_lj += LDG(&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi] - * (ntypes + 1) * 2]); -# else - E_lj += tex1Dfetch( - nbparam.nbfp_texobj, - atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi] - * (ntypes + 1) * 2); -# endif + // load only the first 4 bytes of the parameter pair (equivalent with nbfp[idx].x) + E_lj += LDG((float*)&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi] + * (ntypes + 1)]); # endif } diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh index 4850298f8f..b916997a22 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_utils.cuh @@ -2,7 +2,7 @@ * This file is part of the GROMACS molecular simulation package. * * Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team. - * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by + * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -204,11 +204,13 @@ static __forceinline__ __device__ void static __forceinline__ __device__ float calculate_lj_ewald_c6grid(const NBParamGpu nbparam, int typei, int typej) { # if DISABLE_CUDA_TEXTURES - return LDG(&nbparam.nbfp_comb[2 * typei]) * LDG(&nbparam.nbfp_comb[2 * typej]); + float c6_i = LDG(&nbparam.nbfp_comb[typei]).x; + float c6_j = LDG(&nbparam.nbfp_comb[typej]).x; # else - return tex1Dfetch(nbparam.nbfp_comb_texobj, 2 * typei) - * tex1Dfetch(nbparam.nbfp_comb_texobj, 2 * typej); + float c6_i = tex1Dfetch(nbparam.nbfp_comb_texobj, typei).x; + float c6_j = tex1Dfetch(nbparam.nbfp_comb_texobj, typej).x; # endif /* DISABLE_CUDA_TEXTURES */ + return c6_i * c6_j; } @@ -278,19 +280,11 @@ static __forceinline__ __device__ void calculate_lj_ewald_comb_geom_F_E(const NB */ static __forceinline__ __device__ float2 fetch_nbfp_comb_c6_c12(const NBParamGpu nbparam, int type) { - float2 c6c12; # if DISABLE_CUDA_TEXTURES - /* Force an 8-byte fetch to save a memory instruction. */ - float2* nbfp_comb = (float2*)nbparam.nbfp_comb; - c6c12 = LDG(&nbfp_comb[type]); + return LDG(&nbparam.nbfp_comb[type]); # else - /* NOTE: as we always do 8-byte aligned loads, we could - fetch float2 here too just as above. */ - c6c12.x = tex1Dfetch(nbparam.nbfp_comb_texobj, 2 * type); - c6c12.y = tex1Dfetch(nbparam.nbfp_comb_texobj, 2 * type + 1); + return tex1Dfetch(nbparam.nbfp_comb_texobj, type); # endif /* DISABLE_CUDA_TEXTURES */ - - return c6c12; } @@ -357,8 +351,8 @@ static __forceinline__ __device__ float2 fetch_coulomb_force_r(const NBParamGpu d.x = LDG(&nbparam.coulomb_tab[index]); d.y = LDG(&nbparam.coulomb_tab[index + 1]); # else - d.x = tex1Dfetch(nbparam.coulomb_tab_texobj, index); - d.y = tex1Dfetch(nbparam.coulomb_tab_texobj, index + 1); + d.x = tex1Dfetch(nbparam.coulomb_tab_texobj, index); + d.y = tex1Dfetch(nbparam.coulomb_tab_texobj, index + 1); # endif // DISABLE_CUDA_TEXTURES return d; @@ -397,19 +391,14 @@ static __forceinline__ __device__ float interpolate_coulomb_force_r(const NBPara */ static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const NBParamGpu nbparam, int baseIndex) { + float2 c6c12; # if DISABLE_CUDA_TEXTURES - /* Force an 8-byte fetch to save a memory instruction. */ - float2* nbfp = (float2*)nbparam.nbfp; - float2 c6c12; - c6c12 = LDG(&nbfp[baseIndex]); - c6 = c6c12.x; - c12 = c6c12.y; + c6c12 = LDG(&nbparam.nbfp[baseIndex]); # else - /* NOTE: as we always do 8-byte aligned loads, we could - fetch float2 here too just as above. */ - c6 = tex1Dfetch(nbparam.nbfp_texobj, 2 * baseIndex); - c12 = tex1Dfetch(nbparam.nbfp_texobj, 2 * baseIndex + 1); + c6c12 = tex1Dfetch(nbparam.nbfp_texobj, baseIndex); # endif // DISABLE_CUDA_TEXTURES + c6 = c6c12.x; + c12 = c6c12.y; } diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h index 4b7f0e762e..ee6357e3f6 100644 --- a/src/gromacs/nbnxm/gpu_types_common.h +++ b/src/gromacs/nbnxm/gpu_types_common.h @@ -175,12 +175,12 @@ struct NBParamGpu switch_consts_t vdw_switch; /* LJ non-bonded parameters - accessed through texture memory */ - //! nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements - DeviceBuffer nbfp; + //! nonbonded parameter table with 6*C6/12*C12 pairs per atom type-pair, ntype^2 elements + DeviceBuffer nbfp; //! texture object bound to nbfp DeviceTexture nbfp_texobj; - //! nonbonded parameter table per atom type, 2*ntype elements - DeviceBuffer nbfp_comb; + //! nonbonded parameter table per atom type, ntype elements + DeviceBuffer nbfp_comb; //! texture object bound to nbfp_comb DeviceTexture nbfp_comb_texobj; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index a2ee20416f..0583d0a73d 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -163,19 +163,28 @@ static void init_nbparam(NBParamGpu* nbp, allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext); } - const int nnbfp = 2 * nbatParams.numTypes * nbatParams.numTypes; - const int nnbfp_comb = 2 * nbatParams.numTypes; - { /* set up LJ parameter lookup table */ - DeviceBuffer nbfp; - initParamLookupTable(&nbfp, nullptr, nbatParams.nbfp.data(), nnbfp, deviceContext); + static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp.data())), + "Mismatch in the size of host / device data types"); + DeviceBuffer nbfp; + initParamLookupTable(&nbfp, + nullptr, + reinterpret_cast(nbatParams.nbfp.data()), + nbatParams.numTypes * nbatParams.numTypes, + deviceContext); nbp->nbfp = nbfp; if (ic->vdwtype == VanDerWaalsType::Pme) { - DeviceBuffer nbfp_comb; - initParamLookupTable(&nbfp_comb, nullptr, nbatParams.nbfp_comb.data(), nnbfp_comb, deviceContext); + static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())), + "Mismatch in the size of host / device data types"); + DeviceBuffer nbfp_comb; + initParamLookupTable(&nbfp_comb, + nullptr, + reinterpret_cast(nbatParams.nbfp_comb.data()), + nbatParams.numTypes, + deviceContext); nbp->nbfp_comb = nbfp_comb; } } @@ -482,8 +491,9 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwType)) { - static_assert(sizeof(float) == sizeof(*nbat->params().lj_comb.data()), - "Size of the LJ parameters element should be equal to the size of float2."); + static_assert( + sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()), + "Size of a pair of LJ parameters elements should be equal to the size of Float2."); copyToDeviceBuffer(&d_atdat->ljComb, reinterpret_cast(nbat->params().lj_comb.data()), 0, diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh index 7a55ca8e2d..e51957eae6 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel.clh @@ -126,11 +126,11 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #else const __global int* restrict atom_types, /* IN */ #endif - const __global float* restrict shift_vec, /* IN stores float3 values */ - __constant const float* gmx_unused nbfp, /* IN */ - __constant const float* gmx_unused nbfp_comb, /* IN */ - __constant const float* gmx_unused coulomb_tab, /* IN */ - const __global nbnxn_sci_t* pl_sci, /* IN */ + const __global float* restrict shift_vec, /* IN stores float3 values */ + __constant const float2* restrict gmx_unused nbfp, /* IN */ + __constant const float2* restrict gmx_unused nbfp_comb, /* IN */ + __constant const float* restrict gmx_unused coulomb_tab, /* IN */ + const __global nbnxn_sci_t* pl_sci, /* IN */ #ifndef PRUNE_NBL const #endif @@ -286,8 +286,8 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) E_el += qi * qi; # endif # if defined LJ_EWALD - E_lj += nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi] - * (ntypes + 1) * 2]; + E_lj += nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi] * (ntypes + 1)] + .x; # endif /* LJ_EWALD */ } @@ -411,8 +411,10 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #endif /* IATYPE_SHMEM */ /* LJ 6*C6 and 12*C12 */ #ifndef LJ_COMB - const float c6 = nbfp[2 * (ntypes * typei + typej)]; - const float c12 = nbfp[2 * (ntypes * typei + typej) + 1]; + const float2 c6c12 = nbfp[ntypes * typei + typej]; + + const float c6 = c6c12.x; + const float c12 = c6c12.y; #else /* LJ_COMB */ # ifdef LJ_COMB_GEOM const float c6 = ljcp_i.x * ljcp_j.x; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh index b3cb444857..fa942547ba 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_kernel_utils.clh @@ -456,19 +456,28 @@ gmx_opencl_inline void calculate_potential_switch_F_E(const cl_nbparam_params_t* *E_lj *= sw; } +/*! \brief Fetch C6 grid contribution coefficients and return the product of these. + */ +gmx_opencl_inline float calculate_lj_ewald_c6grid(__constant const float2* nbfp_comb, int typei, int typej) +{ + const float c6_i = nbfp_comb[typei].x; + const float c6_j = nbfp_comb[typej].x; + return c6_i * c6_j; +} + /*! Calculate LJ-PME grid force contribution with * geometric combination rule. */ -gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nbfp_comb, - int typei, - int typej, - float r2, - float inv_r2, - float lje_coeff2, - float lje_coeff6_6, - float* F_invr) +gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float2* nbfp_comb, + int typei, + int typej, + float r2, + float inv_r2, + float lje_coeff2, + float lje_coeff6_6, + float* F_invr) { - const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej]; + const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej); /* Recalculate inv_r6 without exclusion mask */ const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2; @@ -483,7 +492,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nb /*! Calculate LJ-PME grid force + energy contribution with * geometric combination rule. */ -gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float* nbfp_comb, +gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float2* nbfp_comb, const cl_nbparam_params_t* nbparam, int typei, int typej, @@ -495,7 +504,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float* float* F_invr, float* E_lj) { - const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej]; + const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej); /* Recalculate inv_r6 without exclusion mask */ const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2; @@ -516,7 +525,7 @@ gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float* * We use a single F+E kernel with conditional because the performance impact * of this is pretty small and LB on the CPU is anyway very slow. */ -gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float* nbfp_comb, +gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float2* nbfp_comb, const cl_nbparam_params_t* nbparam, int typei, int typej, @@ -530,9 +539,10 @@ gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float* float* E_lj) { /* sigma and epsilon are scaled to give 6*C6 */ - const float sigma = nbfp_comb[2 * typei] + nbfp_comb[2 * typej]; - - const float epsilon = nbfp_comb[2 * typei + 1] * nbfp_comb[2 * typej + 1]; + const float2 c6c12_i = nbfp_comb[typei]; + const float2 c6c12_j = nbfp_comb[typej]; + const float sigma = c6c12_i.x + c6c12_j.x; + const float epsilon = c6c12_i.y + c6c12_j.y; const float sigma2 = sigma * sigma; const float c6grid = epsilon * sigma2 * sigma2 * sigma2; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp index 0f67f04f0a..b7ad72a6a0 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp @@ -124,15 +124,26 @@ static void initNbparam(NBParamGpu* nbp, /* set up LJ parameter lookup table */ if (!useLjCombRule(nbp->vdwType)) { - initParamLookupTable( - &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * numTypes * numTypes, deviceContext); + static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())), + "Mismatch in the size of host / device data types"); + initParamLookupTable(&nbp->nbfp, + &nbp->nbfp_texobj, + reinterpret_cast(nbatParams.nbfp.data()), + numTypes * numTypes, + deviceContext); } /* set up LJ-PME parameter lookup table */ if (ic.vdwtype == VanDerWaalsType::Pme) { - initParamLookupTable( - &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * numTypes, deviceContext); + static_assert(sizeof(decltype(nbp->nbfp_comb)) + == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())), + "Mismatch in the size of host / device data types"); + initParamLookupTable(&nbp->nbfp_comb, + &nbp->nbfp_comb_texobj, + reinterpret_cast(nbatParams.nbfp_comb.data()), + numTypes, + deviceContext); } } diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index a372afa775..504557d563 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -100,15 +100,14 @@ using cl::sycl::access::fence_space; using cl::sycl::access::mode; using cl::sycl::access::target; -static inline void convertSigmaEpsilonToC6C12(const float sigma, - const float epsilon, - cl::sycl::private_ptr c6, - cl::sycl::private_ptr c12) +static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float epsilon) { const float sigma2 = sigma * sigma; const float sigma6 = sigma2 * sigma2 * sigma2; - *c6 = epsilon * sigma6; - *c12 = (*c6) * sigma6; + const float c6 = epsilon * sigma6; + const float c12 = c6 * sigma6; + + return Float2(c6, c12); } template @@ -147,25 +146,23 @@ static inline void ljForceSwitch(const shift_consts_t dispersionShift, //! \brief Fetch C6 grid contribution coefficients and return the product of these. template -static inline float calculateLJEwaldC6Grid(const DeviceAccessor a_nbfpComb, - const int typeI, - const int typeJ) +static inline float calculateLJEwaldC6Grid(const DeviceAccessor a_nbfpComb, + const int typeI, + const int typeJ) { if constexpr (vdwType == VdwType::EwaldGeom) { - return a_nbfpComb[2 * typeI] * a_nbfpComb[2 * typeJ]; + return a_nbfpComb[typeI][0] * a_nbfpComb[typeJ][0]; } else { static_assert(vdwType == VdwType::EwaldLB); /* sigma and epsilon are scaled to give 6*C6 */ - const float c6_i = a_nbfpComb[2 * typeI]; - const float c12_i = a_nbfpComb[2 * typeI + 1]; - const float c6_j = a_nbfpComb[2 * typeJ]; - const float c12_j = a_nbfpComb[2 * typeJ + 1]; + const Float2 c6c12_i = a_nbfpComb[typeI]; + const Float2 c6c12_j = a_nbfpComb[typeJ]; - const float sigma = c6_i + c6_j; - const float epsilon = c12_i * c12_j; + const float sigma = c6c12_i[0] + c6c12_j[0]; + const float epsilon = c6c12_i[1] * c6c12_j[1]; const float sigma2 = sigma * sigma; return epsilon * sigma2 * sigma2 * sigma2; @@ -174,17 +171,17 @@ static inline float calculateLJEwaldC6Grid(const DeviceAccessor -static inline void ljEwaldComb(const DeviceAccessor a_nbfpComb, - const float sh_lj_ewald, - const int typeI, - const int typeJ, - const float r2, - const float r2Inv, - const float lje_coeff2, - const float lje_coeff6_6, - const float int_bit, - cl::sycl::private_ptr fInvR, - cl::sycl::private_ptr eLJ) +static inline void ljEwaldComb(const DeviceAccessor a_nbfpComb, + const float sh_lj_ewald, + const int typeI, + const int typeJ, + const float r2, + const float r2Inv, + const float lje_coeff2, + const float lje_coeff6_6, + const float int_bit, + cl::sycl::private_ptr fInvR, + cl::sycl::private_ptr eLJ) { const float c6grid = calculateLJEwaldC6Grid(a_nbfpComb, typeI, typeJ); @@ -429,8 +426,8 @@ auto nbnxmKernel(cl::sycl::handler& cgh, DeviceAccessor a_plistExcl, OptionalAccessor> a_ljComb, OptionalAccessor> a_atomTypes, - OptionalAccessor> a_nbfp, - OptionalAccessor> a_nbfpComb, + OptionalAccessor> a_nbfp, + OptionalAccessor> a_nbfpComb, OptionalAccessor> a_coulombTab, const int numTypes, const float rCoulombSq, @@ -609,7 +606,7 @@ auto nbnxmKernel(cl::sycl::handler& cgh, { energyVdw += a_nbfp[a_atomTypes[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi] - * (numTypes + 1) * 2]; + * (numTypes + 1)][0]; } } /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */ @@ -712,23 +709,21 @@ auto nbnxmKernel(cl::sycl::handler& cgh, { const float qi = xqi[3]; int atomTypeI; // Only needed if (!props.vdwComb) - float c6, c12, sigma, epsilon; + float sigma, epsilon; + Float2 c6c12; if constexpr (!props.vdwComb) { /* LJ 6*C6 and 12*C12 */ - atomTypeI = sm_atomTypeI[i][tidxi]; - const int idx = (numTypes * atomTypeI + atomTypeJ) * 2; - c6 = a_nbfp[idx]; // TODO: Make a_nbfm into float2 - c12 = a_nbfp[idx + 1]; + atomTypeI = sm_atomTypeI[i][tidxi]; + c6c12 = a_nbfp[numTypes * atomTypeI + atomTypeJ]; } else { const Float2 ljCombI = sm_ljCombI[i][tidxi]; if constexpr (props.vdwCombGeom) { - c6 = ljCombI[0] * ljCombJ[0]; - c12 = ljCombI[1] * ljCombJ[1]; + c6c12 = Float2(ljCombI[0] * ljCombJ[0], ljCombI[1] * ljCombJ[1]); } else { @@ -738,11 +733,15 @@ auto nbnxmKernel(cl::sycl::handler& cgh, epsilon = ljCombI[1] * ljCombJ[1]; if constexpr (doCalcEnergies) { - convertSigmaEpsilonToC6C12(sigma, epsilon, &c6, &c12); + c6c12 = convertSigmaEpsilonToC6C12(sigma, epsilon); } } // props.vdwCombGeom } // !props.vdwComb + // c6 and c12 are unused and garbage iff props.vdwCombLB && !doCalcEnergies + const float c6 = c6c12[0]; + const float c12 = c6c12[1]; + // Ensure distance do not become so small that r^-12 overflows r2 = std::max(r2, c_nbnxnMinDistanceSquared); #if GMX_SYCL_HIPSYCL