* To help us fund GROMACS development, we humbly ask that you cite
* the research papers on the package. Check out http://www.gromacs.org.
*/
+#ifndef CUDA_ARCH_UTILS_CUH_
+#define CUDA_ARCH_UTILS_CUH_
/*! \file
* \brief CUDA arch dependent definitions.
* \author Szilard Pall <pall.szilard@gmail.com>
*/
-/* TODO move here other CUDA arch-related defines like WARP_SIZE */
-
/* GMX_PTX_ARCH is set to the virtual arch (PTX) version targeted by
 * the current compiler pass, or to zero for the host pass, and is
 * intended to be used instead of __CUDA_ARCH__.
#else
#define GMX_PTX_ARCH __CUDA_ARCH__
#endif
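/* A minimal sketch of the intended usage (the branches below are purely
 * illustrative): generation-specific code should test GMX_PTX_ARCH rather
 * than __CUDA_ARCH__, so that the host pass, where the macro is zero, also
 * selects a valid branch.
 */
#if GMX_PTX_ARCH >= 300
/* e.g. warp-shuffle based reductions, available from CC 3.0 on */
#else
/* e.g. shared-memory based fallback reductions */
#endif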
+
+/* Up to and including CC 5.2, and likely for the near future, all NVIDIA
+ architectures have a warp size of 32. Should this change, the constants
+ below will have to depend on the value of GMX_PTX_ARCH.
+ */
+static const int warp_size = 32;
+static const int warp_size_log2 = 5;
+
+#endif /* CUDA_ARCH_UTILS_CUH_ */
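/* A minimal host-side sketch of the consistency check these constants make
 * possible, comparing against the warp size reported at device detection time.
 * The function name is hypothetical, it needs cuda_runtime.h and
 * gromacs/utility/fatalerror.h, and reporting via gmx_fatal is just one option.
 */
static void check_device_warp_size(const cudaDeviceProp *prop)
{
    if (prop->warpSize != warp_size)
    {
        gmx_fatal(FARGS, "Unsupported device: warp size %d detected, %d expected",
                  prop->warpSize, warp_size);
    }
}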
#include "gromacs/utility/fatalerror.h"
-/* CUDA library and hardware related defines */
-/* TODO list some constants instead that can be used for consistency checks to
- detect future devices with features that make the currect code incompatible
- with them (e.g. expected warp size = 32, check against the dev_info->props.warpsize). */
-#define WARP_SIZE 32
-
/* TODO error checking needs to be rewritten. Two types of error checks are needed,
   depending on where they occur in the code:
   - non performance-critical: these errors are unsafe to ignore and must be
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2015, by the GROMACS development team, led by
+ * Copyright (c) 2012,2015,2016, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#define VECTYPE_OPS_CUH
/**** float3 ****/
-inline __host__ __device__ float3 make_float3(float s)
+__forceinline__ __host__ __device__ float3 make_float3(float s)
{
return make_float3(s, s, s);
}
-inline __host__ __device__ float3 make_float3(float4 a)
+__forceinline__ __host__ __device__ float3 make_float3(float4 a)
{
return make_float3(a.x, a.y, a.z);
}
-inline __host__ __device__ float3 operator-(float3 &a)
+__forceinline__ __host__ __device__ float3 operator-(float3 &a)
{
return make_float3(-a.x, -a.y, -a.z);
}
-inline __host__ __device__ float3 operator+(float3 a, float3 b)
+__forceinline__ __host__ __device__ float3 operator+(float3 a, float3 b)
{
return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
}
-inline __host__ __device__ float3 operator-(float3 a, float3 b)
+__forceinline__ __host__ __device__ float3 operator-(float3 a, float3 b)
{
return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
}
-inline __host__ __device__ float3 operator*(float3 a, float k)
+__forceinline__ __host__ __device__ float3 operator*(float3 a, float k)
{
return make_float3(k * a.x, k * a.y, k * a.z);
}
-inline __host__ __device__ float3 operator*(float k, float3 a)
+__forceinline__ __host__ __device__ float3 operator*(float k, float3 a)
{
return make_float3(k * a.x, k * a.y, k * a.z);
}
-inline __host__ __device__ void operator += (float3 &a, float3 b)
+__forceinline__ __host__ __device__ void operator += (float3 &a, float3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
-inline __host__ __device__ void operator += (float3 &a, float4 b)
+__forceinline__ __host__ __device__ void operator += (float3 &a, float4 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
-inline __host__ __device__ void operator -= (float3 &a, float3 b)
+__forceinline__ __host__ __device__ void operator -= (float3 &a, float3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
-inline __host__ __device__ float norm(float3 a)
+__forceinline__ __host__ __device__ float norm(float3 a)
{
return sqrt(a.x * a.x + a.y * a.y + a.z * a.z);
}
-inline __host__ __device__ float norm2(float3 a)
+__forceinline__ __host__ __device__ float norm2(float3 a)
{
return (a.x * a.x + a.y * a.y + a.z * a.z);
}
-inline __host__ __device__ float dist3(float3 a, float3 b)
+__forceinline__ __host__ __device__ float dist3(float3 a, float3 b)
{
return norm(b - a);
}
-inline __host__ __device__ float3 operator*(float3 a, float3 b)
+__forceinline__ __host__ __device__ float3 operator*(float3 a, float3 b)
{
return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
}
-inline __host__ __device__ void operator *= (float3 &a, float3 b)
+__forceinline__ __host__ __device__ void operator *= (float3 &a, float3 b)
{
a.x *= b.x; a.y *= b.y; a.z *= b.z;
}
-inline __device__ void atomicAdd(float3 *addr, float3 val)
+__forceinline__ __device__ void atomicAdd(float3 *addr, float3 val)
{
atomicAdd(&addr->x, val.x);
atomicAdd(&addr->y, val.y);
/****************************************************************/
/**** float4 ****/
-inline __host__ __device__ float4 make_float4(float s)
+__forceinline__ __host__ __device__ float4 make_float4(float s)
{
return make_float4(s, s, s, s);
}
-inline __host__ __device__ float4 make_float4(float3 a)
+__forceinline__ __host__ __device__ float4 make_float4(float3 a)
{
return make_float4(a.x, a.y, a.z, 0.0f);
}
-inline __host__ __device__ float4 operator+(float4 a, float4 b)
+__forceinline__ __host__ __device__ float4 operator+(float4 a, float4 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
-inline __host__ __device__ float4 operator+(float4 a, float3 b)
+__forceinline__ __host__ __device__ float4 operator+(float4 a, float3 b)
{
return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w);
}
-inline __host__ __device__ float4 operator-(float4 a, float4 b)
+__forceinline__ __host__ __device__ float4 operator-(float4 a, float4 b)
{
return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
-inline __host__ __device__ float4 operator*(float4 a, float k)
+__forceinline__ __host__ __device__ float4 operator*(float4 a, float k)
{
return make_float4(k * a.x, k * a.y, k * a.z, k * a.w);
}
-inline __host__ __device__ void operator += (float4 &a, float4 b)
+__forceinline__ __host__ __device__ void operator += (float4 &a, float4 b)
{
a.x += b.x; a.y += b.y; a.z += b.z; a.w += b.w;
}
-inline __host__ __device__ void operator += (float4 &a, float3 b)
+__forceinline__ __host__ __device__ void operator += (float4 &a, float3 b)
{
a.x += b.x; a.y += b.y; a.z += b.z;
}
-inline __host__ __device__ void operator -= (float4 &a, float3 b)
+__forceinline__ __host__ __device__ void operator -= (float4 &a, float3 b)
{
a.x -= b.x; a.y -= b.y; a.z -= b.z;
}
-inline __host__ __device__ float norm(float4 a)
+__forceinline__ __host__ __device__ float norm(float4 a)
{
return sqrt(a.x * a.x + a.y * a.y + a.z * a.z + a.w * a.w);
}
-inline __host__ __device__ float dist3(float4 a, float4 b)
+__forceinline__ __host__ __device__ float dist3(float4 a, float4 b)
{
return norm(b - a);
}
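/* A small usage sketch (the function and its local names are hypothetical)
 * showing how the overloads above combine in a typical force-style update:
 */
static __forceinline__ __device__
void accumulate_pair_force(const float4 xi, const float4 xj,
                           float scale, float3 *fout)
{
    float3 rv = make_float3(xi) - make_float3(xj); /* distance vector          */
    float  r2 = norm2(rv);                         /* squared distance         */
    float3 f  = rv * (scale / (r2 + 1.0f));        /* arbitrary scaling factor */

    atomicAdd(fout, f);                            /* per-component atomic add */
}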
* - CC 3.7: NTHREAD_Z=2, (128, 16) bounds
*/
#if GMX_PTX_ARCH == 370
-#define NTHREAD_Z (2)
-#define MIN_BLOCKS_PER_MP (16)
+ #define NTHREAD_Z (2)
+ #define MIN_BLOCKS_PER_MP (16)
#else
-#define NTHREAD_Z (1)
-#define MIN_BLOCKS_PER_MP (16)
-#endif
+ #define NTHREAD_Z (1)
+ #define MIN_BLOCKS_PER_MP (16)
+#endif /* GMX_PTX_ARCH == 370 */
#define THREADS_PER_BLOCK (CL_SIZE*CL_SIZE*NTHREAD_Z)
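/* Worked example of what the defines above evaluate to, assuming the usual
 * CL_SIZE of 8: on CC 3.7 the block holds 8*8*2 = 128 threads, which together
 * with MIN_BLOCKS_PER_MP = 16 gives the (128, 16) bounds mentioned above;
 * on all other targets it is 8*8*1 = 64 threads per block.
 */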
#if GMX_PTX_ARCH >= 350
#if (GMX_PTX_ARCH <= 210) && (NTHREAD_Z > 1)
-#error NTHREAD_Z > 1 will give incorrect results on CC 2.x
+ #error NTHREAD_Z > 1 will give incorrect results on CC 2.x
#endif
/**@}*/
__launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP)
#else
__launch_bounds__(THREADS_PER_BLOCK)
-#endif
+#endif /* GMX_PTX_ARCH >= 350 */
#ifdef PRUNE_NBL
#ifdef CALC_ENERGIES
__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_cuda)
#else
__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_prune_cuda)
-#endif
+#endif /* CALC_ENERGIES */
#else
#ifdef CALC_ENERGIES
__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_cuda)
#else
__global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
-#endif
-#endif
+#endif /* CALC_ENERGIES */
+#endif /* PRUNE_NBL */
(const cu_atomdata_t atdat,
const cu_nbparam_t nbparam,
const cu_plist_t plist,
unsigned int tidxz = threadIdx.z;
#endif
unsigned int bidx = blockIdx.x;
- unsigned int widx = tidx / WARP_SIZE; /* warp index */
+ unsigned int widx = tidx / warp_size; /* warp index */
int sci, ci, cj, ci_offset,
ai, aj,
{
wexcl_idx = pl_cj4[j4].imei[widx].excl_ind;
imask = pl_cj4[j4].imei[widx].imask;
- wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
+ wexcl = excl[wexcl_idx].pair[(tidx) & (warp_size - 1)];
#ifndef PRUNE_NBL
if (imask)
/* flush the energies to shmem and reduce them */
f_buf[ tidx] = E_lj;
f_buf[FBUF_STRIDE + tidx] = E_el;
- reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+ reduce_energy_pow2(f_buf + (tidx & warp_size), e_lj, e_el, tidx & ~warp_size);
#endif
#endif
}
#define USE_TEXOBJ
#endif
-#define WARP_SIZE_POW2_EXPONENT (5)
-#define CL_SIZE_POW2_EXPONENT (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
-#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
-#define FBUF_STRIDE (CL_SIZE_SQ)
+#define CL_SIZE_LOG2 (3) /* change this together with CL_SIZE !*/
+#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE (CL_SIZE_SQ)
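/* A guard one could add so the two values cannot drift apart; a sketch that
 * assumes CL_SIZE expands to a plain integer constant:
 */
#if (1 << CL_SIZE_LOG2) != CL_SIZE
#error "CL_SIZE_LOG2 is inconsistent with CL_SIZE"
#endif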
#define ONE_SIXTH_F 0.16666667f
#define ONE_TWELVETH_F 0.08333333f
#endif /* GMX_CUDA_NB_SINGLE_COMPILATION_UNIT */
/*! Apply force switch, force + energy version. */
-static inline __device__
+static __forceinline__ __device__
void calculate_force_switch_F(const cu_nbparam_t nbparam,
float c6,
float c12,
}
/*! Apply force switch, force-only version. */
-static inline __device__
+static __forceinline__ __device__
void calculate_force_switch_F_E(const cu_nbparam_t nbparam,
float c6,
float c12,
}
/*! Apply potential switch, force-only version. */
-static inline __device__
+static __forceinline__ __device__
void calculate_potential_switch_F(const cu_nbparam_t nbparam,
float c6,
float c12,
}
/*! Apply potential switch, force + energy version. */
-static inline __device__
+static __forceinline__ __device__
void calculate_potential_switch_F_E(const cu_nbparam_t nbparam,
float c6,
float c12,
/*! Calculate LJ-PME grid force contribution with
* geometric combination rule.
*/
-static inline __device__
+static __forceinline__ __device__
void calculate_lj_ewald_comb_geom_F(const cu_nbparam_t nbparam,
int typei,
int typej,
/*! Calculate LJ-PME grid force + energy contribution with
* geometric combination rule.
*/
-static inline __device__
+static __forceinline__ __device__
void calculate_lj_ewald_comb_geom_F_E(const cu_nbparam_t nbparam,
int typei,
int typej,
 * We use a single F+E kernel with a conditional because the performance impact
 * of this is small and LB on the CPU is very slow anyway.
*/
-static inline __device__
+static __forceinline__ __device__
void calculate_lj_ewald_comb_LB_F_E(const cu_nbparam_t nbparam,
int typei,
int typej,
/*! Interpolate Ewald coulomb force using the table through the tex_nbfp texture.
* Original idea: from the OpenMM project
*/
-static inline __device__
+static __forceinline__ __device__
float interpolate_coulomb_force_r(float r, float scale)
{
float normalized = scale * r;
+ fract2 * tex1Dfetch(coulomb_tab_texref, index + 1);
}
-static inline __device__
+static __forceinline__ __device__
float interpolate_coulomb_force_r(cudaTextureObject_t texobj_coulomb_tab,
float r, float scale)
{
}
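/* Both overloads implement the same linear table interpolation; a scalar
 * sketch of that arithmetic (the table pointer and local names are
 * illustrative only):
 */
static __forceinline__ __device__
float interpolate_linear(const float *table, float r, float scale)
{
    float normalized = scale * r;          /* table is uniform in scale*r */
    int   index      = (int)normalized;    /* lower table bin             */
    float fract2     = normalized - index; /* fraction within the bin     */
    float fract1     = 1.0f - fract2;

    return fract1 * table[index] + fract2 * table[index + 1];
}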
/*! Calculate analytical Ewald correction term. */
-static inline __device__
+static __forceinline__ __device__
float pmecorrF(float z2)
{
const float FN6 = -1.7357322914161492954e-8f;
/*! Final j-force reduction; this generic implementation works with
* arbitrary array sizes.
*/
-static inline __device__
+static __forceinline__ __device__
void reduce_force_j_generic(float *f_buf, float3 *fout,
int tidxi, int tidxj, int aidx)
{
* array sizes and with sm >= 3.0
*/
#if GMX_PTX_ARCH >= 300
-static inline __device__
+static __forceinline__ __device__
void reduce_force_j_warp_shfl(float3 f, float3 *fout,
int tidxi, int aidx)
{
* arbitrary array sizes.
* TODO: add the tidxi < 3 trick
*/
-static inline __device__
+static __forceinline__ __device__
void reduce_force_i_generic(float *f_buf, float3 *fout,
float *fshift_buf, bool bCalcFshift,
int tidxi, int tidxj, int aidx)
/*! Final i-force reduction; this implementation works only with power of two
* array sizes.
*/
-static inline __device__
+static __forceinline__ __device__
void reduce_force_i_pow2(volatile float *f_buf, float3 *fout,
float *fshift_buf, bool bCalcFshift,
int tidxi, int tidxj, int aidx)
 * Can't just use i as the loop variable because then nvcc refuses to unroll.
*/
i = CL_SIZE/2;
- # pragma unroll 5
- for (j = CL_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+#pragma unroll 5
+ for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
{
if (tidxj < i)
{
/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
* on whether the size of the array to be reduced is power of two or not.
*/
-static inline __device__
+static __forceinline__ __device__
void reduce_force_i(float *f_buf, float3 *f,
float *fshift_buf, bool bCalcFshift,
int tidxi, int tidxj, int ai)
* array sizes and with sm >= 3.0
*/
#if GMX_PTX_ARCH >= 300
-static inline __device__
+static __forceinline__ __device__
void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
float *fshift_buf, bool bCalcFshift,
int tidxj, int aidx)
/*! Energy reduction; this implementation works only with power of two
* array sizes.
*/
-static inline __device__
+static __forceinline__ __device__
void reduce_energy_pow2(volatile float *buf,
float *e_lj, float *e_el,
unsigned int tidx)
int i, j;
float e1, e2;
- i = WARP_SIZE/2;
+ i = warp_size/2;
    /* Can't just use i as the loop variable because then nvcc refuses to unroll. */
-# pragma unroll 10
- for (j = WARP_SIZE_POW2_EXPONENT - 1; j > 0; j--)
+#pragma unroll 10
+ for (j = warp_size_log2 - 1; j > 0; j--)
{
if (tidx < i)
{
* array sizes and with sm >= 3.0
*/
#if GMX_PTX_ARCH >= 300
-static inline __device__
+static __forceinline__ __device__
void reduce_energy_warp_shfl(float E_lj, float E_el,
float *e_lj, float *e_el,
int tidx)
}
/* The first thread in the warp writes the reduced energies */
- if (tidx == 0 || tidx == WARP_SIZE)
+ if (tidx == 0 || tidx == warp_size)
{
atomicAdd(e_lj, E_lj);
atomicAdd(e_el, E_el);
#define __device__
-#define WARP_SIZE_POW2_EXPONENT (5)
-#define CL_SIZE_POW2_EXPONENT (3) /* change this together with GPU_NS_CLUSTER_SIZE !*/
-#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
-#define FBUF_STRIDE (CL_SIZE_SQ)
+#define WARP_SIZE_LOG2 (5)
+#define CL_SIZE_LOG2 (3) /* change this together with CL_SIZE !*/
+#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
+#define FBUF_STRIDE (CL_SIZE_SQ)
#define ONE_SIXTH_F 0.16666667f
#define ONE_TWELVETH_F 0.08333333f