/* set up LJ parameter lookup table */
if (!useLjCombRule(nbp->vdwType))
{
- initParamLookupTable(
- &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * ntypes * ntypes, deviceContext);
+ static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+ "Mismatch in the size of host / device data types");
+ initParamLookupTable(&nbp->nbfp,
+ &nbp->nbfp_texobj,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+ ntypes * ntypes,
+ deviceContext);
}
/* set up LJ-PME parameter lookup table */
if (ic->vdwtype == VanDerWaalsType::Pme)
{
- initParamLookupTable(
- &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * ntypes, deviceContext);
+ static_assert(sizeof(decltype(nbp->nbfp_comb))
+ == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+ "Mismatch in the size of host / device data types");
+ initParamLookupTable(&nbp->nbfp_comb,
+ &nbp->nbfp_comb_texobj,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+ ntypes,
+ deviceContext);
}
}
# endif
# ifdef LJ_EWALD
-# if DISABLE_CUDA_TEXTURES
- E_lj += LDG(&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
- * (ntypes + 1) * 2]);
-# else
- E_lj += tex1Dfetch<float>(
- nbparam.nbfp_texobj,
- atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
- * (ntypes + 1) * 2);
-# endif
+ // load only the first 4 bytes of the parameter pair (equivalent with nbfp[idx].x)
+ E_lj += LDG((float*)&nbparam.nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+ * (ntypes + 1)]);
# endif
}
* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2016,2017 by the GROMACS development team.
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
static __forceinline__ __device__ float calculate_lj_ewald_c6grid(const NBParamGpu nbparam, int typei, int typej)
{
# if DISABLE_CUDA_TEXTURES
- return LDG(&nbparam.nbfp_comb[2 * typei]) * LDG(&nbparam.nbfp_comb[2 * typej]);
+ float c6_i = LDG(&nbparam.nbfp_comb[typei]).x;
+ float c6_j = LDG(&nbparam.nbfp_comb[typej]).x;
# else
- return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typei)
- * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * typej);
+ float c6_i = tex1Dfetch<float2>(nbparam.nbfp_comb_texobj, typei).x;
+ float c6_j = tex1Dfetch<float2>(nbparam.nbfp_comb_texobj, typej).x;
# endif /* DISABLE_CUDA_TEXTURES */
+ return c6_i * c6_j;
}
*/
static __forceinline__ __device__ float2 fetch_nbfp_comb_c6_c12(const NBParamGpu nbparam, int type)
{
- float2 c6c12;
# if DISABLE_CUDA_TEXTURES
- /* Force an 8-byte fetch to save a memory instruction. */
- float2* nbfp_comb = (float2*)nbparam.nbfp_comb;
- c6c12 = LDG(&nbfp_comb[type]);
+ return LDG(&nbparam.nbfp_comb[type]);
# else
- /* NOTE: as we always do 8-byte aligned loads, we could
- fetch float2 here too just as above. */
- c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type);
- c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2 * type + 1);
+ return tex1Dfetch<float2>(nbparam.nbfp_comb_texobj, type);
# endif /* DISABLE_CUDA_TEXTURES */
-
- return c6c12;
}
d.x = LDG(&nbparam.coulomb_tab[index]);
d.y = LDG(&nbparam.coulomb_tab[index + 1]);
# else
- d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
- d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
+ d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
+ d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
# endif // DISABLE_CUDA_TEXTURES
return d;
*/
static __forceinline__ __device__ void fetch_nbfp_c6_c12(float& c6, float& c12, const NBParamGpu nbparam, int baseIndex)
{
+ float2 c6c12;
# if DISABLE_CUDA_TEXTURES
- /* Force an 8-byte fetch to save a memory instruction. */
- float2* nbfp = (float2*)nbparam.nbfp;
- float2 c6c12;
- c6c12 = LDG(&nbfp[baseIndex]);
- c6 = c6c12.x;
- c12 = c6c12.y;
+ c6c12 = LDG(&nbparam.nbfp[baseIndex]);
# else
- /* NOTE: as we always do 8-byte aligned loads, we could
- fetch float2 here too just as above. */
- c6 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex);
- c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * baseIndex + 1);
+ c6c12 = tex1Dfetch<float2>(nbparam.nbfp_texobj, baseIndex);
# endif // DISABLE_CUDA_TEXTURES
+ c6 = c6c12.x;
+ c12 = c6c12.y;
}
switch_consts_t vdw_switch;
/* LJ non-bonded parameters - accessed through texture memory */
- //! nonbonded parameter table with C6/C12 pairs per atom type-pair, 2*ntype^2 elements
- DeviceBuffer<float> nbfp;
+ //! nonbonded parameter table with 6*C6/12*C12 pairs per atom type-pair, ntype^2 elements
+ DeviceBuffer<Float2> nbfp;
//! texture object bound to nbfp
DeviceTexture nbfp_texobj;
- //! nonbonded parameter table per atom type, 2*ntype elements
- DeviceBuffer<float> nbfp_comb;
+ //! nonbonded parameter table per atom type, ntype elements
+ DeviceBuffer<Float2> nbfp_comb;
//! texture object bound to nbfp_comb
DeviceTexture nbfp_comb_texobj;
allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
}
- const int nnbfp = 2 * nbatParams.numTypes * nbatParams.numTypes;
- const int nnbfp_comb = 2 * nbatParams.numTypes;
-
{
/* set up LJ parameter lookup table */
- DeviceBuffer<real> nbfp;
- initParamLookupTable(&nbfp, nullptr, nbatParams.nbfp.data(), nnbfp, deviceContext);
+ static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+ "Mismatch in the size of host / device data types");
+ DeviceBuffer<Float2> nbfp;
+ initParamLookupTable(&nbfp,
+ nullptr,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+ nbatParams.numTypes * nbatParams.numTypes,
+ deviceContext);
nbp->nbfp = nbfp;
if (ic->vdwtype == VanDerWaalsType::Pme)
{
- DeviceBuffer<float> nbfp_comb;
- initParamLookupTable(&nbfp_comb, nullptr, nbatParams.nbfp_comb.data(), nnbfp_comb, deviceContext);
+ static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+ "Mismatch in the size of host / device data types");
+ DeviceBuffer<Float2> nbfp_comb;
+ initParamLookupTable(&nbfp_comb,
+ nullptr,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+ nbatParams.numTypes,
+ deviceContext);
nbp->nbfp_comb = nbfp_comb;
}
}
if (useLjCombRule(nb->nbparam->vdwType))
{
- static_assert(sizeof(float) == sizeof(*nbat->params().lj_comb.data()),
- "Size of the LJ parameters element should be equal to the size of float2.");
+ static_assert(
+ sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
+ "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
copyToDeviceBuffer(&d_atdat->ljComb,
reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
0,
#else
const __global int* restrict atom_types, /* IN */
#endif
- const __global float* restrict shift_vec, /* IN stores float3 values */
- __constant const float* gmx_unused nbfp, /* IN */
- __constant const float* gmx_unused nbfp_comb, /* IN */
- __constant const float* gmx_unused coulomb_tab, /* IN */
- const __global nbnxn_sci_t* pl_sci, /* IN */
+ const __global float* restrict shift_vec, /* IN stores float3 values */
+ __constant const float2* restrict gmx_unused nbfp, /* IN */
+ __constant const float2* restrict gmx_unused nbfp_comb, /* IN */
+ __constant const float* restrict gmx_unused coulomb_tab, /* IN */
+ const __global nbnxn_sci_t* pl_sci, /* IN */
#ifndef PRUNE_NBL
const
#endif
E_el += qi * qi;
# endif
# if defined LJ_EWALD
- E_lj += nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi]
- * (ntypes + 1) * 2];
+ E_lj += nbfp[atom_types[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * CL_SIZE + tidxi] * (ntypes + 1)]
+ .x;
# endif /* LJ_EWALD */
}
#endif /* IATYPE_SHMEM */
/* LJ 6*C6 and 12*C12 */
#ifndef LJ_COMB
- const float c6 = nbfp[2 * (ntypes * typei + typej)];
- const float c12 = nbfp[2 * (ntypes * typei + typej) + 1];
+ const float2 c6c12 = nbfp[ntypes * typei + typej];
+
+ const float c6 = c6c12.x;
+ const float c12 = c6c12.y;
#else /* LJ_COMB */
# ifdef LJ_COMB_GEOM
const float c6 = ljcp_i.x * ljcp_j.x;
*E_lj *= sw;
}
+/*! \brief Fetch C6 grid contribution coefficients and return the product of these.
+ */
+gmx_opencl_inline float calculate_lj_ewald_c6grid(__constant const float2* nbfp_comb, int typei, int typej)
+{
+ const float c6_i = nbfp_comb[typei].x;
+ const float c6_j = nbfp_comb[typej].x;
+ return c6_i * c6_j;
+}
+
/*! Calculate LJ-PME grid force contribution with
* geometric combination rule.
*/
-gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float* nbfp_comb,
- int typei,
- int typej,
- float r2,
- float inv_r2,
- float lje_coeff2,
- float lje_coeff6_6,
- float* F_invr)
+gmx_opencl_inline void calculate_lj_ewald_comb_geom_F(__constant const float2* nbfp_comb,
+ int typei,
+ int typej,
+ float r2,
+ float inv_r2,
+ float lje_coeff2,
+ float lje_coeff6_6,
+ float* F_invr)
{
- const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej];
+ const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);
/* Recalculate inv_r6 without exclusion mask */
const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
/*! Calculate LJ-PME grid force + energy contribution with
* geometric combination rule.
*/
-gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float* nbfp_comb,
+gmx_opencl_inline void calculate_lj_ewald_comb_geom_F_E(__constant const float2* nbfp_comb,
const cl_nbparam_params_t* nbparam,
int typei,
int typej,
float* F_invr,
float* E_lj)
{
- const float c6grid = nbfp_comb[2 * typei] * nbfp_comb[2 * typej];
+ const float c6grid = calculate_lj_ewald_c6grid(nbfp_comb, typei, typej);
/* Recalculate inv_r6 without exclusion mask */
const float inv_r6_nm = inv_r2 * inv_r2 * inv_r2;
* We use a single F+E kernel with conditional because the performance impact
* of this is pretty small and LB on the CPU is anyway very slow.
*/
-gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float* nbfp_comb,
+gmx_opencl_inline void calculate_lj_ewald_comb_LB_F_E(__constant const float2* nbfp_comb,
const cl_nbparam_params_t* nbparam,
int typei,
int typej,
float* E_lj)
{
/* sigma and epsilon are scaled to give 6*C6 */
- const float sigma = nbfp_comb[2 * typei] + nbfp_comb[2 * typej];
-
- const float epsilon = nbfp_comb[2 * typei + 1] * nbfp_comb[2 * typej + 1];
+ const float2 c6c12_i = nbfp_comb[typei];
+ const float2 c6c12_j = nbfp_comb[typej];
+ const float sigma = c6c12_i.x + c6c12_j.x;
+    const float  epsilon = c6c12_i.y * c6c12_j.y;
const float sigma2 = sigma * sigma;
const float c6grid = epsilon * sigma2 * sigma2 * sigma2;
/* set up LJ parameter lookup table */
if (!useLjCombRule(nbp->vdwType))
{
- initParamLookupTable(
- &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * numTypes * numTypes, deviceContext);
+ static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+ "Mismatch in the size of host / device data types");
+ initParamLookupTable(&nbp->nbfp,
+ &nbp->nbfp_texobj,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+ numTypes * numTypes,
+ deviceContext);
}
/* set up LJ-PME parameter lookup table */
if (ic.vdwtype == VanDerWaalsType::Pme)
{
- initParamLookupTable(
- &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * numTypes, deviceContext);
+ static_assert(sizeof(decltype(nbp->nbfp_comb))
+ == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+ "Mismatch in the size of host / device data types");
+ initParamLookupTable(&nbp->nbfp_comb,
+ &nbp->nbfp_comb_texobj,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+ numTypes,
+ deviceContext);
}
}
using cl::sycl::access::mode;
using cl::sycl::access::target;
-static inline void convertSigmaEpsilonToC6C12(const float sigma,
- const float epsilon,
- cl::sycl::private_ptr<float> c6,
- cl::sycl::private_ptr<float> c12)
+static inline Float2 convertSigmaEpsilonToC6C12(const float sigma, const float epsilon)
{
const float sigma2 = sigma * sigma;
const float sigma6 = sigma2 * sigma2 * sigma2;
- *c6 = epsilon * sigma6;
- *c12 = (*c6) * sigma6;
+ const float c6 = epsilon * sigma6;
+ const float c12 = c6 * sigma6;
+
+ return Float2(c6, c12);
}
template<bool doCalcEnergies>
//! \brief Fetch C6 grid contribution coefficients and return the product of these.
template<enum VdwType vdwType>
-static inline float calculateLJEwaldC6Grid(const DeviceAccessor<float, mode::read> a_nbfpComb,
- const int typeI,
- const int typeJ)
+static inline float calculateLJEwaldC6Grid(const DeviceAccessor<Float2, mode::read> a_nbfpComb,
+ const int typeI,
+ const int typeJ)
{
if constexpr (vdwType == VdwType::EwaldGeom)
{
- return a_nbfpComb[2 * typeI] * a_nbfpComb[2 * typeJ];
+ return a_nbfpComb[typeI][0] * a_nbfpComb[typeJ][0];
}
else
{
static_assert(vdwType == VdwType::EwaldLB);
/* sigma and epsilon are scaled to give 6*C6 */
- const float c6_i = a_nbfpComb[2 * typeI];
- const float c12_i = a_nbfpComb[2 * typeI + 1];
- const float c6_j = a_nbfpComb[2 * typeJ];
- const float c12_j = a_nbfpComb[2 * typeJ + 1];
+ const Float2 c6c12_i = a_nbfpComb[typeI];
+ const Float2 c6c12_j = a_nbfpComb[typeJ];
- const float sigma = c6_i + c6_j;
- const float epsilon = c12_i * c12_j;
+ const float sigma = c6c12_i[0] + c6c12_j[0];
+ const float epsilon = c6c12_i[1] * c6c12_j[1];
const float sigma2 = sigma * sigma;
return epsilon * sigma2 * sigma2 * sigma2;
//! Calculate LJ-PME grid force contribution with geometric or LB combination rule.
template<bool doCalcEnergies, enum VdwType vdwType>
-static inline void ljEwaldComb(const DeviceAccessor<float, mode::read> a_nbfpComb,
- const float sh_lj_ewald,
- const int typeI,
- const int typeJ,
- const float r2,
- const float r2Inv,
- const float lje_coeff2,
- const float lje_coeff6_6,
- const float int_bit,
- cl::sycl::private_ptr<float> fInvR,
- cl::sycl::private_ptr<float> eLJ)
+static inline void ljEwaldComb(const DeviceAccessor<Float2, mode::read> a_nbfpComb,
+ const float sh_lj_ewald,
+ const int typeI,
+ const int typeJ,
+ const float r2,
+ const float r2Inv,
+ const float lje_coeff2,
+ const float lje_coeff6_6,
+ const float int_bit,
+ cl::sycl::private_ptr<float> fInvR,
+ cl::sycl::private_ptr<float> eLJ)
{
const float c6grid = calculateLJEwaldC6Grid<vdwType>(a_nbfpComb, typeI, typeJ);
DeviceAccessor<nbnxn_excl_t, mode::read> a_plistExcl,
OptionalAccessor<Float2, mode::read, ljComb<vdwType>> a_ljComb,
OptionalAccessor<int, mode::read, !ljComb<vdwType>> a_atomTypes,
- OptionalAccessor<float, mode::read, !ljComb<vdwType>> a_nbfp,
- OptionalAccessor<float, mode::read, ljEwald<vdwType>> a_nbfpComb,
+ OptionalAccessor<Float2, mode::read, !ljComb<vdwType>> a_nbfp,
+ OptionalAccessor<Float2, mode::read, ljEwald<vdwType>> a_nbfpComb,
OptionalAccessor<float, mode::read, elecEwaldTab<elecType>> a_coulombTab,
const int numTypes,
const float rCoulombSq,
{
energyVdw +=
a_nbfp[a_atomTypes[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
- * (numTypes + 1) * 2];
+ * (numTypes + 1)][0];
}
}
/* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
{
const float qi = xqi[3];
int atomTypeI; // Only needed if (!props.vdwComb)
- float c6, c12, sigma, epsilon;
+ float sigma, epsilon;
+ Float2 c6c12;
if constexpr (!props.vdwComb)
{
/* LJ 6*C6 and 12*C12 */
- atomTypeI = sm_atomTypeI[i][tidxi];
- const int idx = (numTypes * atomTypeI + atomTypeJ) * 2;
- c6 = a_nbfp[idx]; // TODO: Make a_nbfm into float2
- c12 = a_nbfp[idx + 1];
+ atomTypeI = sm_atomTypeI[i][tidxi];
+ c6c12 = a_nbfp[numTypes * atomTypeI + atomTypeJ];
}
else
{
const Float2 ljCombI = sm_ljCombI[i][tidxi];
if constexpr (props.vdwCombGeom)
{
- c6 = ljCombI[0] * ljCombJ[0];
- c12 = ljCombI[1] * ljCombJ[1];
+ c6c12 = Float2(ljCombI[0] * ljCombJ[0], ljCombI[1] * ljCombJ[1]);
}
else
{
epsilon = ljCombI[1] * ljCombJ[1];
if constexpr (doCalcEnergies)
{
- convertSigmaEpsilonToC6C12(sigma, epsilon, &c6, &c12);
+ c6c12 = convertSigmaEpsilonToC6C12(sigma, epsilon);
}
} // props.vdwCombGeom
} // !props.vdwComb
+ // c6 and c12 are unused and garbage iff props.vdwCombLB && !doCalcEnergies
+ const float c6 = c6c12[0];
+ const float c12 = c6c12[1];
+
// Ensure distance do not become so small that r^-12 overflows
r2 = std::max(r2, c_nbnxnMinDistanceSquared);
#if GMX_SYCL_HIPSYCL