* the research papers on the package. Check out http://www.gromacs.org.
*/
+/*! \internal \file
+ * \brief OpenCL non-bonded kernel for AMD GPUs.
+ *
+ * OpenCL 1.2 support is expected.
+ *
+ * \author Anca Hamuraru <anca@streamcomputing.eu>
+ * \author Szilárd Páll <pall.szilard@gmail.com>
+ * \ingroup module_mdlib
+ */
+
+
#include "nbnxn_ocl_kernel_utils.clh"
/////////////////////////////////////////////////////////////////////////////////////////////////
#define LJ_EWALD
#endif
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_COMB
+#endif
+
/*
Kernel launch parameters:
- #blocks = #pair lists, blockId = pair list Id
- shmem = CL_SIZE^2 * sizeof(float)
Each thread calculates an i force-component taking one pair of i-j atoms.
+
+ TODO: implement 128 threads/wavefront by porting over the NTHREAD_Z/j4 loop
+ "horizontal splitting" over threads.
*/
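+
+/* For illustration, a host-side launch matching the geometry above might look
+ * roughly as follows (a sketch only: "queue", "kernel" and "numPairLists" are
+ * hypothetical host variables, and mapping the pair-list index to dimension 1
+ * is an assumption, not taken from this file):
+ *
+ *     size_t local[2]  = { CL_SIZE, CL_SIZE };
+ *     size_t global[2] = { CL_SIZE, CL_SIZE * numPairLists };
+ *     clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local,
+ *                            0, NULL, NULL);
+ */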
-//#if __CUDA_ARCH__ >= 350
-//__launch_bounds__(64, 16)
-//#endif
+
/* NOTE:
   NB_KERNEL_FUNC_NAME differs from its CUDA equivalent in that it is not a variadic macro, since OpenCL has no support for variadic macros; this version takes exactly 2 arguments.
   Thus, if more strings need to be appended, a new macro must be written or they must be appended directly here.
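+
+   For illustration (hedged: the actual flavor infix is generated per
+   electrostatics/VdW variant elsewhere in the build),
+   NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) pastes its two arguments
+   around that infix, yielding a single kernel name such as
+   nbnxn_kernel_ElecCut_VdwLJ_F_opencl.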
__kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
#endif
#endif
-(int ntypes, /* IN */
+(
+#ifndef LJ_COMB
+ int ntypes, /* IN */
+#endif
cl_nbparam_params_t nbparam_params, /* IN */
const __global float4 *restrict xq, /* IN */
__global float *restrict f, /* stores float3 values */ /* OUT */
__global float *restrict e_lj, /* OUT */
__global float *restrict e_el, /* OUT */
__global float *restrict fshift, /* stores float3 values */ /* OUT */
+#ifdef LJ_COMB
+ const __global float2 *restrict lj_comb, /* stores float2 values */ /* IN */
+#else
const __global int *restrict atom_types, /* IN */
+#endif
const __global float *restrict shift_vec, /* stores float3 values */ /* IN */
__constant float* nbfp_climg2d, /* IN */
__constant float* nbfp_comb_climg2d, /* IN */
cl_nbparam_params_t *nbparam = &nbparam_params;
float rcoulomb_sq = nbparam->rcoulomb_sq;
-
+#ifdef LJ_COMB
+ float2 ljcp_i, ljcp_j;
+#endif
#ifdef VDW_CUTOFF_CHECK
- float rvdw_sq = nbparam_params.rvdw_sq;//nbparam->rvdw_sq;
+ float rvdw_sq = nbparam_params.rvdw_sq;
float vdw_in_range;
#endif
#ifdef LJ_EWALD
unsigned int widx = tidx / WARP_SIZE; /* warp index */
int sci, ci, cj, ci_offset,
ai, aj,
- cij4_start, cij4_end,
- typei, typej,
- i, jm, j4, wexcl_idx;
+ cij4_start, cij4_end;
+#ifndef LJ_COMB
+ int typei, typej;
+#endif
+ int i, jm, j4, wexcl_idx;
float qi, qj_f,
- r2, inv_r, inv_r2, inv_r6,
- c6, c12,
- int_bit,
+ r2, inv_r, inv_r2;
+#if !defined LJ_COMB_LB || defined CALC_ENERGIES
+ float inv_r6, c6, c12;
+#endif
+#if defined LJ_COMB_LB
+ float sigma, epsilon;
+#endif
+ float int_bit,
F_invr;
#ifdef CALC_ENERGIES
float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
nbnxn_sci_t nb_sci;
+ /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */
+ const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
+
/* shmem buffer for cj, for both warps separately */
- __local int *cjs = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+ __local int *cjs = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
#define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
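+/* Note: LOCAL_OFFSET tracks the end of the last sub-buffer carved out of the
+ * single __local allocation; each consumer below re-defines it to point just
+ * past its own slice, so the buffers are packed back to back. */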
-#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+#ifdef IATYPE_SHMEM
+#ifndef LJ_COMB
/* shmem buffer for i atom-type pre-loading */
- __local int *atib = (__local int *)(LOCAL_OFFSET);
+ __local int *atib = (__local int *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
#define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#else
+ __local float2 *ljcpib = (__local float2 *)(LOCAL_OFFSET);
+ #undef LOCAL_OFFSET
+ #define LOCAL_OFFSET ljcpib + NCL_PER_SUPERCL * CL_SIZE
+#endif
#endif
#ifndef REDUCE_SHUFFLE
/* shmem j force buffer */
- __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+ __local float *f_buf = (__local float *)(LOCAL_OFFSET);
#undef LOCAL_OFFSET
#define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
#endif
ci = sci * NCL_PER_SUPERCL + tidxj;
ai = ci * CL_SIZE + tidxi;
- xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ xqbuf.w *= nbparam->epsfac;
+ xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
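+    /* Note: the charge is pre-scaled by epsfac above, folding the Coulomb
+     * prefactor into the qi*qj products once; this is why the j-charge load
+     * later in the kernel no longer multiplies by epsfac. */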
-#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+#ifdef IATYPE_SHMEM
+#ifndef LJ_COMB
/* Pre-load the i-atom types into shared memory */
- atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+ atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+#else
+ ljcpib[tidxj * CL_SIZE + tidxi] = lj_comb[ai];
+#endif
#endif
 /* Initialise the warp vote: done once per warp, by its first lane
    (an 8x8 block is 2 warps on NVIDIA). */
if(tidx==0 || tidx==32)
#endif /* LJ_EWALD */
#if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
- E_el /= CL_SIZE;
+    /* The accumulated qi*qi self-interaction terms carry epsfac^2 because the
+       charges were pre-scaled by epsfac; divide one factor back out. */
+    E_el /= nbparam->epsfac*CL_SIZE;
#if defined EL_RF || defined EL_CUTOFF
- E_el *= -nbparam->epsfac*0.5f*c_rf;
+ E_el *= -0.5f*c_rf;
#else
- E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+ E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
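+    /* This applies the Ewald self-energy correction, proportional to
+     * -beta/sqrt(pi) * sum(q_i^2). */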
#endif
-#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
- }
-#endif /* EXCLUSION_FORCES */
-
-#endif /* CALC_ENERGIES */
-
- /* skip central shifts when summing shift forces */
- if (nb_sci.shift == CENTRAL)
- {
- bCalcFshift = false;
+#endif /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
}
+#endif /* EXCLUSION_FORCES */
- fshift_buf = 0.0f;
+#endif /* CALC_ENERGIES */
/* loop over the j clusters = seen by any of the atoms in the current super-cluster */
for (j4 = cij4_start; j4 < cij4_end; j4++)
cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
}
- /* Unrolling this loop
- - with pruning leads to register spilling;
- - on Kepler is much slower;
- - doesn't work on CUDA <v4.1
- Tested with nvcc 3.2 - 5.0.7 */
-#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
-//#pragma unroll 4
+    /* Unrolling this loop improves performance without pruning, but with
+     * pruning enabled it leads to a slowdown.
+     *
+     * Tested with driver 1800.5.
+     */
+#if !defined PRUNE_NBL
+#pragma unroll 4
#endif
for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
{
- if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+ if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
{
mask_ji = (1U << (jm * NCL_PER_SUPERCL));
/* load j atom data */
xqbuf = xq[aj];
xj = (float3)(xqbuf.xyz);
- qj_f = nbparam->epsfac * xqbuf.w;
+ qj_f = xqbuf.w;
+#ifndef LJ_COMB
typej = atom_types[aj];
+#else
+ ljcp_j = lj_comb[aj];
+#endif
fcj_buf = (float3)(0.0f);
- /* The PME and RF kernels don't unroll with CUDA <v4.1. */
-#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
-//#pragma unroll 8
+#if !defined PRUNE_NBL
+#pragma unroll 8
#endif
for (i = 0; i < NCL_PER_SUPERCL; i++)
{
imask &= ~mask_ji;
warp_any[widx]=0;
-
#endif
int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
{
/* load the rest of the i-atom parameters */
qi = xqbuf.w;
-#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+#ifdef IATYPE_SHMEM
+#ifndef LJ_COMB
typei = atib[i * CL_SIZE + tidxi];
#else
+ ljcp_i = ljcpib[i * CL_SIZE + tidxi];
+#endif
+#else /* IATYPE_SHMEM */
+#ifndef LJ_COMB
typei = atom_types[ai];
+#else
+ ljcp_i = lj_comb[ai];
#endif
+#endif /* IATYPE_SHMEM */
/* LJ 6*C6 and 12*C12 */
+#ifndef LJ_COMB
c6 = nbfp_climg2d[2 * (ntypes * typei + typej)];
c12 = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+#else /* LJ_COMB */
+#ifdef LJ_COMB_GEOM
+ c6 = ljcp_i.x * ljcp_j.x;
+ c12 = ljcp_i.y * ljcp_j.y;
+#else
+ /* LJ 2^(1/6)*sigma and 12*epsilon */
+ sigma = ljcp_i.x + ljcp_j.x;
+ epsilon = ljcp_i.y * ljcp_j.y;
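+                /* Per the comment above, sigma/epsilon here are the combined
+                 * 2^(1/6)*sigma and 12*epsilon. Assuming the helper computes
+                 * c6 = eps*sigma^6 and c12 = c6*sigma^6 (a sketch, not
+                 * verified here), this recovers the 6*C6/12*C12 convention of
+                 * the atom-type branch: 12*eps*(2^(1/6)*s)^6 = 24*eps*s^6
+                 * = 6*(4*eps*s^6) = 6*C6, and 24*eps*s^6 * 2*s^6
+                 * = 48*eps*s^12 = 12*C12. */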
+#if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+ convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);
+#endif
+#endif /* LJ_COMB_GEOM */
+#endif /* LJ_COMB */
- /* avoid NaN for excluded pairs at r=0 */
- r2 += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+                // Ensure the distance does not become so small that r^-12 overflows
+                r2 = max(r2, NBNXN_MIN_RSQ);
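+                // (This clamp also covers excluded pairs at r = 0, which were
+                // previously handled by adding NBNXN_AVOID_SING_R2_INC.)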
inv_r = rsqrt(r2);
inv_r2 = inv_r * inv_r;
+#if !defined LJ_COMB_LB || defined CALC_ENERGIES
inv_r6 = inv_r2 * inv_r2 * inv_r2;
#if defined EXCLUSION_FORCES
/* We could mask inv_r2, but with Ewald
c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
#endif
+#else /* ! LJ_COMB_LB || CALC_ENERGIES */
+ float sig_r = sigma*inv_r;
+ float sig_r2 = sig_r*sig_r;
+ float sig_r6 = sig_r2*sig_r2*sig_r2;
+#if defined EXCLUSION_FORCES
+ sig_r6 *= int_bit;
+#endif /* EXCLUSION_FORCES */
+
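+                /* With sigma = 2^(1/6)*sigma_LJ and epsilon = 12*eps as
+                 * combined above, sig_r6 = 2*(sigma_LJ/r)^6, so the line
+                 * below evaluates to
+                 * (48*eps*(s/r)^12 - 24*eps*(s/r)^6) / r^2,
+                 * the standard LJ F/r, without ever forming c6/c12. */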
+ F_invr = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+#endif /* ! LJ_COMB_LB || CALC_ENERGIES */
#ifdef LJ_FORCE_SWITCH
}
}
+ /* skip central shifts when summing shift forces */
+ if (nb_sci.shift == CENTRAL)
+ {
+ bCalcFshift = false;
+ }
+
+ fshift_buf = 0.0f;
+
/* reduce i forces */
for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
{
barrier(CLK_LOCAL_MEM_FENCE);
}
- /* add up local shift forces into global mem */
- //if (bCalcFshift && tidxj == 0)
- // atomicAdd_g_f3(&(fshift[3 * nb_sci.shift]),fshift_buf);
+ /* add up local shift forces into global mem */
if (bCalcFshift)
- {
+ {
        /* Only threads with tidxj < 3 will update fshift.
           The threads performing the update must be the same as the threads
           that stored the reduction result in the reduce_force_i function
#undef EL_EWALD_ANY
#undef EXCLUSION_FORCES
#undef LJ_EWALD
+
+#undef LJ_COMB