* \author Szilárd Páll <pall.szilard@gmail.com>
* \ingroup module_mdlib
*/
-#include "config.h"
-
#include <assert.h>
/* Note that floating-point constants in CUDA code should be suffixed
* with f (e.g. 0.5f), to stop the compiler producing intermediate
* code that is in double precision.
*/
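+/* For example, 0.5f*r2 keeps the arithmetic in single precision, while a
+ * bare 0.5 literal is a double and would promote the whole expression. */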
-#include "config.h"
#include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
#include "gromacs/gpu_utils/vectype_ops.cuh"
#include "nbnxn_cuda_types.h"
#ifndef NBNXN_CUDA_KERNEL_UTILS_CUH
#define NBNXN_CUDA_KERNEL_UTILS_CUH
-/* Use texture objects if supported by the target hardware. */
-#if GMX_PTX_ARCH >= 300
+/* Use texture objects if supported by the target hardware (and in the host
+ * compilation pass, where GMX_PTX_ARCH == 0). */
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
/* Note: convenience macro, needs to be undef-ed at the end of the file. */
#define USE_TEXOBJ
#endif
static const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
/*! \brief Stride in the force accumulation buffer */
static const int c_fbufStride = c_clSizeSq;
+/*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl (= 8) bits set */
+static const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
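+/* For example, with c_numClPerSupercl == 8 the mask is 0xff; bit i of a
+ * pair-list interaction mask flags whether i-cluster i of the super-cluster
+ * interacts with the given j-cluster. */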
static const float c_oneSixth = 0.16666667f;
static const float c_oneTwelveth = 0.08333333f;
-/* With multiple compilation units this ensures that texture refs are available
- in the the kernels' compilation units. */
-#if !GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
-/*! Texture reference for LJ C6/C12 parameters; bound to cu_nbparam_t.nbfp */
-extern texture<float, 1, cudaReadModeElementType> nbfp_texref;
-
-/*! Texture reference for LJ-PME parameters; bound to cu_nbparam_t.nbfp_comb */
-extern texture<float, 1, cudaReadModeElementType> nbfp_comb_texref;
-
-/*! Texture reference for Ewald coulomb force table; bound to cu_nbparam_t.coulomb_tab */
-extern texture<float, 1, cudaReadModeElementType> coulomb_tab_texref;
-#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
/*! Convert LJ sigma,epsilon parameters to C6,C12. */
static __forceinline__ __device__
/*! Apply potential switch, force-only version. */
static __forceinline__ __device__
void calculate_potential_switch_F(const cu_nbparam_t nbparam,
- float c6,
- float c12,
float inv_r,
float r2,
float *F_invr,
/*! Apply potential switch, force + energy version. */
static __forceinline__ __device__
void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
- float c6,
- float c12,
float inv_r,
float r2,
float *F_invr,
*E_lj *= sw;
}
+
+/*! \brief Fetch C6 grid contribution coefficients and return their product.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+float calculate_lj_ewald_c6grid(const cu_nbparam_t nbparam,
+ int typei,
+ int typej)
+{
+#if DISABLE_CUDA_TEXTURES
+ return LDG(&nbparam.nbfp_comb[2*typei]) * LDG(&nbparam.nbfp_comb[2*typej]);
+#else
+#ifdef USE_TEXOBJ
+ return tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
+#else
+ return tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
+#endif /* USE_TEXOBJ */
+#endif /* DISABLE_CUDA_TEXTURES */
+}
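+/* Note: the direct-load path above uses the LDG macro from
+ * cuda_kernel_utils.cuh, which is expected to map to a read-only cache
+ * __ldg() load on hardware that supports it and to a plain load otherwise. */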
+
+
/*! Calculate LJ-PME grid force contribution with
* geometric combination rule.
*/
{
float c6grid, inv_r6_nm, cr2, expmcr2, poly;
-#ifdef USE_TEXOBJ
- c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
-#else
- c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
-#endif /* USE_TEXOBJ */
+ c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
/* Recalculate inv_r6 without exclusion mask */
inv_r6_nm = inv_r2*inv_r2*inv_r2;
*F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
}
+
/*! Calculate LJ-PME grid force + energy contribution with
* geometric combination rule.
*/
{
float c6grid, inv_r6_nm, cr2, expmcr2, poly, sh_mask;
-#ifdef USE_TEXOBJ
- c6grid = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej);
-#else
- c6grid = tex1Dfetch(nbfp_comb_texref, 2*typei) * tex1Dfetch(nbfp_comb_texref, 2*typej);
-#endif /* USE_TEXOBJ */
+ c6grid = calculate_lj_ewald_c6grid(nbparam, typei, typej);
/* Recalculate inv_r6 without exclusion mask */
inv_r6_nm = inv_r2*inv_r2*inv_r2;
*E_lj += c_oneSixth*c6grid*(inv_r6_nm*(1.0f - expmcr2*poly) + sh_mask);
}
+/*! Fetch per-type LJ parameters.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+float2 fetch_nbfp_comb_c6_c12(const cu_nbparam_t nbparam,
+ int type)
+{
+ float2 c6c12;
+#if DISABLE_CUDA_TEXTURES
+ /* Force an 8-byte fetch to save a memory instruction. */
+ float2 *nbfp_comb = (float2 *)nbparam.nbfp_comb;
+ c6c12 = LDG(&nbfp_comb[type]);
+#else
+ /* NOTE: as we always do 8-byte aligned loads, we could
+ fetch float2 here too just as above. */
+#ifdef USE_TEXOBJ
+ c6c12.x = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type);
+ c6c12.y = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*type + 1);
+#else
+ c6c12.x = tex1Dfetch(nbfp_comb_texref, 2*type);
+ c6c12.y = tex1Dfetch(nbfp_comb_texref, 2*type + 1);
+#endif /* USE_TEXOBJ */
+#endif /* DISABLE_CUDA_TEXTURES */
+
+ return c6c12;
+}
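+/* Note: the float2 cast above assumes that each type's two parameters are
+ * stored contiguously and that the table is 8-byte aligned; memory returned
+ * by cudaMalloc is at least 256-byte aligned, so every float2 element is
+ * naturally aligned. */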
+
+
/*! Calculate LJ-PME grid force + energy contribution (if E_lj != NULL) with
* Lorentz-Berthelot combination rule.
* We use a single F+E kernel with conditional because the performance impact
float sigma, sigma2, epsilon;
/* sigma and epsilon are scaled to give 6*C6 */
-#ifdef USE_TEXOBJ
- sigma = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei ) + tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej );
- epsilon = tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typei + 1) * tex1Dfetch<float>(nbparam.nbfp_comb_texobj, 2*typej + 1);
-#else
- sigma = tex1Dfetch(nbfp_comb_texref, 2*typei ) + tex1Dfetch(nbfp_comb_texref, 2*typej );
- epsilon = tex1Dfetch(nbfp_comb_texref, 2*typei + 1) * tex1Dfetch(nbfp_comb_texref, 2*typej + 1);
-#endif /* USE_TEXOBJ */
+ float2 c6c12_i = fetch_nbfp_comb_c6_c12(nbparam, typei);
+ float2 c6c12_j = fetch_nbfp_comb_c6_c12(nbparam, typej);
+
+ sigma = c6c12_i.x + c6c12_j.x;
+ epsilon = c6c12_i.y * c6c12_j.y;
+
sigma2 = sigma*sigma;
c6grid = epsilon*sigma2*sigma2*sigma2;
}
}
-/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
- * Original idea: from the OpenMM project
+
+/*! Fetch two consecutive values from the Ewald correction F*r table.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
*/
static __forceinline__ __device__
-float interpolate_coulomb_force_r(float r, float scale)
+float2 fetch_coulomb_force_r(const cu_nbparam_t nbparam,
+ int index)
{
- float normalized = scale * r;
- int index = (int) normalized;
- float fract2 = normalized - index;
- float fract1 = 1.0f - fract2;
+ float2 d;
+
+#if DISABLE_CUDA_TEXTURES
+ /* Can't do 8-byte fetch because some of the addresses will be misaligned. */
+ d.x = LDG(&nbparam.coulomb_tab[index]);
+ d.y = LDG(&nbparam.coulomb_tab[index + 1]);
+#else
+#ifdef USE_TEXOBJ
+ d.x = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index);
+ d.y = tex1Dfetch<float>(nbparam.coulomb_tab_texobj, index + 1);
+#else
+ d.x = tex1Dfetch(coulomb_tab_texref, index);
+ d.y = tex1Dfetch(coulomb_tab_texref, index + 1);
+#endif /* USE_TEXOBJ */
+#endif /* DISABLE_CUDA_TEXTURES */
- return fract1 * tex1Dfetch(coulomb_tab_texref, index)
- + fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
+ return d;
}
+/*! Linear interpolation using exactly two FMA operations.
+ *
+ *  Implements the numeric equivalent of: (1-t)*d0 + t*d1.
+ *  CUDA has no fnms instruction, otherwise we'd use
+ *  fma(t, d1, fnms(t, d0, d0)); however, negating an input operand is a
+ *  free input modifier, so fma(-t, d0, d0) is just as fast.
+ */
+template <typename T>
+__forceinline__ __host__ __device__
+T lerp(T d0, T d1, T t)
+{
+ return fma(t, d1, fma(-t, d0, d0));
+}
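+/* For instance, lerp(2.0f, 4.0f, 0.25f) evaluates
+ * fma(0.25f, 4.0f, fma(-0.25f, 2.0f, 2.0f)) = fma(0.25f, 4.0f, 1.5f) = 2.5f,
+ * which matches (1 - 0.25f)*2.0f + 0.25f*4.0f. */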
+
+/*! Interpolate Ewald coulomb force correction using the F*r table.
+ */
static __forceinline__ __device__
-float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
- float r, float scale)
+float interpolate_coulomb_force_r(const cu_nbparam_t nbparam,
+ float r)
{
- float normalized = scale * r;
- int index = (int) normalized;
- float fract2 = normalized - index;
- float fract1 = 1.0f - fract2;
+ float normalized = nbparam.coulomb_tab_scale * r;
+ int index = (int) normalized;
+ float fraction = normalized - index;
+
+ float2 d01 = fetch_coulomb_force_r(nbparam, index);
- return fract1 * tex1Dfetch<float>(texobj_coulomb_tab, index) +
- fract2 * tex1Dfetch<float>(texobj_coulomb_tab, index + 1);
+ return lerp(d01.x, d01.y, fraction);
}
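+/* Sketch with made-up numbers: for coulomb_tab_scale == 100.0f and
+ * r == 0.123f, normalized == 12.3f, so the result blends table entries 12
+ * and 13 with fraction ~0.3. As the table stores F*r, the calling kernels
+ * are expected to apply the charge prefactor and 1/r afterwards. */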
+/*! Fetch C6 and C12 from the parameter table.
+ *
+ * Depending on what is supported, it fetches parameters either
+ * using direct load, texture objects, or texrefs.
+ */
+static __forceinline__ __device__
+void fetch_nbfp_c6_c12(float &c6,
+ float &c12,
+ const cu_nbparam_t nbparam,
+ int baseIndex)
+{
+#if DISABLE_CUDA_TEXTURES
+ /* Force an 8-byte fetch to save a memory instruction. */
+ float2 *nbfp = (float2 *)nbparam.nbfp;
+ float2 c6c12;
+ c6c12 = LDG(&nbfp[baseIndex]);
+ c6 = c6c12.x;
+ c12 = c6c12.y;
+#else
+ /* NOTE: as we always do 8-byte aligned loads, we could
+ fetch float2 here too just as above. */
+#ifdef USE_TEXOBJ
+ c6 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex);
+ c12 = tex1Dfetch<float>(nbparam.nbfp_texobj, 2*baseIndex + 1);
+#else
+ c6 = tex1Dfetch(nbfp_texref, 2*baseIndex);
+ c12 = tex1Dfetch(nbfp_texref, 2*baseIndex + 1);
+#endif /* USE_TEXOBJ */
+#endif /* DISABLE_CUDA_TEXTURES */
+}
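+/* Note on the assumed indexing: nbfp stores a (C6, C12) pair per type pair,
+ * so callers would pass something like baseIndex = numTypes*typei + typej,
+ * where numTypes stands for the per-system atom type count. */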
+
+
/*! Calculate analytical Ewald correction term. */
static __forceinline__ __device__
float pmecorrF(float z2)
/*! Final j-force reduction; this implementation works only with power of two
* array sizes and with sm >= 3.0
*/
-#if GMX_PTX_ARCH >= 300
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
int tidxi, int aidx,
/*! Final i-force reduction; this implementation works only with power of two
* array sizes and with sm >= 3.0
*/
-#if GMX_PTX_ARCH >= 300
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
float *fshift_buf, bool bCalcFshift,
/*! Energy reduction; this implementation works only with power of two
* array sizes and with sm >= 3.0
*/
-#if GMX_PTX_ARCH >= 300
+#if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
static __forceinline__ __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
float *e_lj, float *e_el,