Merge branch release-5-1 into release-2016
[alexxy/gromacs.git] / src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_amd.clh
index b18aa9c18ac8b653a6f6c64b5531a2a45f2bd52d..6c83d98adf00b7c93225c8cf75f33fed33d9da25 100644 (file)
  * the research papers on the package. Check out http://www.gromacs.org.
  */
 
+/*! \internal \file
+ *  \brief OpenCL non-bonded kernel for AMD GPUs.
+ *
+ *  OpenCL 1.2 support is expected.
+ *
+ *  \author Anca Hamuraru <anca@streamcomputing.eu>
+ *  \author Szilárd Páll <pall.szilard@gmail.com>
+ *  \ingroup module_mdlib
+ */
+
+
 #include "nbnxn_ocl_kernel_utils.clh"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 #define LJ_EWALD
 #endif
 
+#if defined LJ_COMB_GEOM || defined LJ_COMB_LB
+/* Note: convenience macro, needs to be undef-ed at the end of the file. */
+#define LJ_COMB
+#endif
+
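/* Illustrative sketch (not part of this patch): flavor macros such as
 * LJ_COMB_GEOM/LJ_COMB_LB are expected to be set before this file is
 * compiled, one kernel flavor per combination. Whether the real build does
 * this with #defines around repeated includes, as below, or with compiler
 * -D options is an assumption here.
 */
#define LJ_COMB_GEOM
#include "nbnxn_ocl_kernel_amd.clh" /* compiles the geometric-rule flavor */
#undef LJ_COMB_GEOM

#define LJ_COMB_LB
#include "nbnxn_ocl_kernel_amd.clh" /* compiles the Lorentz-Berthelot flavor */
#undef LJ_COMB_LB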
 /*
    Kernel launch parameters:
     - #blocks   = #pair lists, blockId = pair list Id
     - shmem     = CL_SIZE^2 * sizeof(float)
 
     Each thread calculates an i force-component taking one pair of i-j atoms.
+
+  TODO: implement 128 threads/wavefront by porting over the NTHREAD_Z/j4 loop
+  "horizontal splitting" over threads.
  */
-//#if __CUDA_ARCH__ >= 350
-//__launch_bounds__(64, 16)
-//#endif
+
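/* Illustrative host-side sketch (not part of this patch): the launch
 * geometry described above maps onto an OpenCL NDRange as below. The queue
 * and kernel handles, the helper name, and nsci (the number of pair lists)
 * are assumptions for illustration.
 */
#include <CL/cl.h>

cl_int launch_nbnxn_kernel(cl_command_queue q, cl_kernel kernel, size_t nsci)
{
    const size_t cl_size   = 8;                       /* CL_SIZE, cluster size     */
    size_t       local[2]  = { cl_size, cl_size };    /* one 8x8 work-group        */
    size_t       global[2] = { cl_size * nsci, cl_size }; /* one group per pair list */
    return clEnqueueNDRangeKernel(q, kernel, 2, NULL, global, local, 0, NULL, NULL);
}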
 /* NOTE:
  NB_KERNEL_FUNC_NAME differs from its CUDA equivalent: because OpenCL has no support for variadic macros, this version takes exactly 2 arguments.
  Thus, if more strings need to be appended, a new macro must be written or they must be appended directly here.
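/* Illustrative sketch (not part of this patch): NB_KERNEL_FUNC_NAME is a
 * plain two-argument token-pasting macro; the electrostatics/VdW infix below
 * is an assumed example, as the real definition is provided per kernel
 * flavor before this file is included.
 */
#define NB_KERNEL_FUNC_NAME(x, y) x##_ElecCut_VdwLJ##y
/* NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
 *   -> nbnxn_kernel_ElecCut_VdwLJ_F_opencl */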
@@ -86,14 +103,21 @@ __attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
         __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
     #endif
 #endif
-(int ntypes,                                                               /* IN  */
+(
+#ifndef LJ_COMB
+ int ntypes,                                                               /* IN  */
+#endif
  cl_nbparam_params_t nbparam_params,                                       /* IN  */
  const __global float4 *restrict xq,                                       /* IN  */
  __global float *restrict f,                /* stores float3 values */     /* OUT */
  __global float *restrict e_lj,                                            /* OUT */
  __global float *restrict e_el,                                            /* OUT */
 __global float *restrict fshift,            /* stores float3 values */     /* OUT */
+#ifdef LJ_COMB
+ const __global float2 *restrict lj_comb,      /* stores float2 values */  /* IN  */
+#else
  const __global int *restrict atom_types,                                  /* IN  */
+#endif
  const __global float *restrict shift_vec,  /* stores float3 values */     /* IN  */
  __constant float* nbfp_climg2d,                                           /* IN  */
  __constant float* nbfp_comb_climg2d,                                      /* IN  */
@@ -113,9 +137,11 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
     cl_nbparam_params_t *nbparam = &nbparam_params;
 
     float               rcoulomb_sq = nbparam->rcoulomb_sq;
-
+#ifdef LJ_COMB
+    float2              ljcp_i, ljcp_j;
+#endif
 #ifdef VDW_CUTOFF_CHECK
-    float               rvdw_sq     = nbparam_params.rvdw_sq;//nbparam->rvdw_sq;
+    float               rvdw_sq     = nbparam_params.rvdw_sq;
     float               vdw_in_range;
 #endif
 #ifdef LJ_EWALD
@@ -152,13 +178,20 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
     unsigned int widx   = tidx / WARP_SIZE; /* warp index */
     int          sci, ci, cj, ci_offset,
                  ai, aj,
-                 cij4_start, cij4_end,
-                 typei, typej,
-                 i, jm, j4, wexcl_idx;
+                 cij4_start, cij4_end;
+#ifndef LJ_COMB
+    int          typei, typej;
+#endif
+    int          i, jm, j4, wexcl_idx;
     float        qi, qj_f,
-                 r2, inv_r, inv_r2, inv_r6,
-                 c6, c12,
-                 int_bit,
+                 r2, inv_r, inv_r2;
+#if !defined LJ_COMB_LB || defined CALC_ENERGIES
+    float        inv_r6, c6, c12;
+#endif
+#if defined LJ_COMB_LB
+    float        sigma, epsilon;
+#endif
+    float        int_bit,
                  F_invr;
 
 #ifdef CALC_ENERGIES
@@ -174,20 +207,29 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
     float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
     nbnxn_sci_t  nb_sci;
 
+    /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */
+    const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
+
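/* Illustrative sketch (not part of this patch): each j-atom jm owns an
 * NCL_PER_SUPERCL-wide bit field in the pair-list interaction mask, and the
 * test below (used before entering the i-loop) checks whether jm interacts
 * with any of the 8 i-clusters in the super-cluster.
 */
#include <stdbool.h>

#define NCL_PER_SUPERCL 8

static bool jm_has_interactions(unsigned imask, int jm)
{
    const unsigned superClInteractionMask = (1U << NCL_PER_SUPERCL) - 1U; /* 0xff */
    return (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL))) != 0;
}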
     /* shmem buffer for cj, for both warps separately */
-    __local int *cjs     = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
+    __local int *cjs       = (__local int *)(xqib + NCL_PER_SUPERCL * CL_SIZE);
     #define LOCAL_OFFSET cjs + 2 * NBNXN_GPU_JGROUP_SIZE
 
-#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+#ifdef IATYPE_SHMEM
+#ifndef LJ_COMB
     /* shmem buffer for i atom-type pre-loading */
-    __local int *atib = (__local int *)(LOCAL_OFFSET);
+    __local int *atib      = (__local int *)(LOCAL_OFFSET);
     #undef LOCAL_OFFSET
     #define LOCAL_OFFSET atib + NCL_PER_SUPERCL * CL_SIZE
+#else
+    __local float2 *ljcpib = (__local float2 *)(LOCAL_OFFSET);
+    #undef LOCAL_OFFSET
+    #define LOCAL_OFFSET ljcpib + NCL_PER_SUPERCL * CL_SIZE
+#endif
 #endif
 
 #ifndef REDUCE_SHUFFLE
     /* shmem j force buffer */
-    __local float *f_buf = (__local float *)(LOCAL_OFFSET);
+    __local float *f_buf   = (__local float *)(LOCAL_OFFSET);
     #undef LOCAL_OFFSET
     #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
 #endif
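/* Illustrative sketch (not part of this patch): the LOCAL_OFFSET macro acts
 * as a compile-time bump allocator over a single __local allocation, with
 * each sub-buffer starting where the previous one ends. A plain-C picture of
 * the layout for the simplest case (no IATYPE_SHMEM buffer); sizes are the
 * kernel's constants.
 */
#define CL_SIZE         8
#define NCL_PER_SUPERCL 8
#define JGROUP_SIZE     4   /* NBNXN_GPU_JGROUP_SIZE */

static float *carve_local_buffers(float *lmem_base)
{
    /* xq buffer: NCL_PER_SUPERCL*CL_SIZE float4 entries (4 floats each) */
    float *past_xqib = lmem_base + 4 * NCL_PER_SUPERCL * CL_SIZE;
    /* cj buffer: one j-group of indices per warp, for 2 warps */
    int   *cjs       = (int *)past_xqib;
    /* j-force reduction buffer: 3 floats per thread in the 8x8 block */
    float *f_buf     = (float *)(cjs + 2 * JGROUP_SIZE);
    return f_buf;
}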
@@ -205,11 +247,17 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
     ci = sci * NCL_PER_SUPERCL + tidxj;
     ai = ci * CL_SIZE + tidxi;
 
-    xqib[tidxj * CL_SIZE + tidxi] = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+    xqbuf    = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+    xqbuf.w *= nbparam->epsfac;
+    xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
 
-#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+#ifdef IATYPE_SHMEM
+#ifndef LJ_COMB
     /* Pre-load the i-atom types into shared memory */
-    atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+    atib[tidxj * CL_SIZE + tidxi]   = atom_types[ai];
+#else
+    ljcpib[tidxj * CL_SIZE + tidxi] = lj_comb[ai];
+#endif
 #endif
     /* Initialise warp vote. (8x8 block, 2 warps for NVIDIA) */
     if(tidx==0 || tidx==32)
@@ -255,25 +303,18 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 #endif  /* LJ_EWALD */
 
 #if defined EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF
-        E_el /= CL_SIZE;
+        /* Correct for epsfac^2 due to adding qi^2 */
+        E_el /= nbparam->epsfac*CL_SIZE;
 #if defined EL_RF || defined EL_CUTOFF
-        E_el *= -nbparam->epsfac*0.5f*c_rf;
+        E_el *= -0.5f*c_rf;
 #else
-        E_el *= -nbparam->epsfac*beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
+        E_el *= -beta*M_FLOAT_1_SQRTPI; /* last factor 1/sqrt(pi) */
 #endif
-#endif                                                 /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
-    }
-#endif                                                 /* EXCLUSION_FORCES */
-
-#endif                                                 /* CALC_ENERGIES */
-
-    /* skip central shifts when summing shift forces */
-    if (nb_sci.shift == CENTRAL)
-    {
-        bCalcFshift = false;
+#endif                                  /* EL_EWALD_ANY || defined EL_RF || defined EL_CUTOFF */
     }
+#endif                                  /* EXCLUSION_FORCES */
 
-    fshift_buf = 0.0f;
+#endif                                  /* CALC_ENERGIES */
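/* Illustrative sketch (not part of this patch): charges are pre-scaled by
 * epsfac at load time (xqbuf.w *= nbparam->epsfac above), so each pair term
 * qi*qj already carries epsfac once, while the qi^2 self-energy terms
 * accumulated here carry epsfac^2; dividing E_el by epsfac leaves exactly
 * one factor, as the final energy requires. Tiny numeric check:
 */
static float self_energy_factor_check(float qi, float epsfac)
{
    float qi_scaled = epsfac * qi;           /* as stored in xqib[].w                 */
    float e_raw     = qi_scaled * qi_scaled; /* accumulated self term: epsfac^2*qi^2  */
    return e_raw / epsfac;                   /* = epsfac*qi^2, a single epsfac factor */
}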
 
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
     for (j4 = cij4_start; j4 < cij4_end; j4++)
@@ -292,18 +333,18 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                 cjs[tidxi + tidxj * NBNXN_GPU_JGROUP_SIZE / 4] = pl_cj4[j4].cj[tidxi];
             }
 
-            /* Unrolling this loop
-               - with pruning leads to register spilling;
-               - on Kepler is much slower;
-               - doesn't work on CUDA <v4.1
-               Tested with nvcc 3.2 - 5.0.7 */
-#if !defined PRUNE_NBL //&& __CUDA_ARCH__ < 300 && CUDA_VERSION >= 4010
-//#pragma unroll 4
+            /* Unrolling this loop improves performance without pruning, but
+             * with pruning it leads to a slowdown.
+             *
+             * Tested with driver 1800.5.
+             */
+#if !defined PRUNE_NBL
+#pragma unroll 4
 #endif
 
             for (jm = 0; jm < NBNXN_GPU_JGROUP_SIZE; jm++)
             {
-                if (imask & (supercl_interaction_mask << (jm * NCL_PER_SUPERCL)))
+                if (imask & (superClInteractionMask << (jm * NCL_PER_SUPERCL)))
                 {
                     mask_ji = (1U << (jm * NCL_PER_SUPERCL));
 
@@ -313,14 +354,17 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                     /* load j atom data */
                     xqbuf   = xq[aj];
                     xj      = (float3)(xqbuf.xyz);
-                    qj_f    = nbparam->epsfac * xqbuf.w;
+                    qj_f    = xqbuf.w;
+#ifndef LJ_COMB
                     typej   = atom_types[aj];
+#else
+                    ljcp_j  = lj_comb[aj];
+#endif
 
                     fcj_buf = (float3)(0.0f);
 
-                    /* The PME and RF kernels don't unroll with CUDA <v4.1. */
-#if !defined PRUNE_NBL //&& !(CUDA_VERSION < 4010 && defined EXCLUSION_FORCES)
-//#pragma unroll 8
+#if !defined PRUNE_NBL
+#pragma unroll 8
 #endif
                     for (i = 0; i < NCL_PER_SUPERCL; i++)
                     {
@@ -351,7 +395,6 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                                 imask &= ~mask_ji;
 
                             warp_any[widx]=0;
-
 #endif
 
                             int_bit = (wexcl & mask_ji) ? 1.0f : 0.0f;
@@ -366,20 +409,43 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                             {
                                 /* load the rest of the i-atom parameters */
                                 qi      = xqbuf.w;
-#ifdef IATYPE_SHMEM //Should not be defined! CUDA > 300
+#ifdef IATYPE_SHMEM
+#ifndef LJ_COMB
                                 typei   = atib[i * CL_SIZE + tidxi];
 #else
+                                ljcp_i  = ljcpib[i * CL_SIZE + tidxi];
+#endif
+#else /* IATYPE_SHMEM */
+#ifndef LJ_COMB
                                 typei   = atom_types[ai];
+#else
+                                ljcp_i  = lj_comb[ai];
 #endif
+#endif /* IATYPE_SHMEM */
                                 /* LJ 6*C6 and 12*C12 */
+#ifndef LJ_COMB
                                 c6      = nbfp_climg2d[2 * (ntypes * typei + typej)];
                                 c12     = nbfp_climg2d[2 * (ntypes * typei + typej)+1];
+#else   /* LJ_COMB */
+#ifdef LJ_COMB_GEOM
+                                c6      = ljcp_i.x * ljcp_j.x;
+                                c12     = ljcp_i.y * ljcp_j.y;
+#else
+                                /* LJ 2^(1/6)*sigma and 12*epsilon */
+                                sigma   = ljcp_i.x + ljcp_j.x;
+                                epsilon = ljcp_i.y * ljcp_j.y;
+#if defined CALC_ENERGIES || defined LJ_FORCE_SWITCH || defined LJ_POT_SWITCH
+                                convert_sigma_epsilon_to_c6_c12(sigma, epsilon, &c6, &c12);
+#endif
+#endif                          /* LJ_COMB_GEOM */
+#endif                          /* LJ_COMB */
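/* Illustrative sketch (not part of this patch): with sigma stored as
 * 2^(1/6)*sigma_ij and epsilon as 12*epsilon_ij (so the 6x/12x LJ prefactors
 * fold into the inputs), the 6*C6 and 12*C12 pair parameters follow by
 * simple powers. This mirrors what convert_sigma_epsilon_to_c6_c12() in the
 * kernel utils is expected to compute.
 */
static void sigma_eps_to_c6_c12(float sigma, float epsilon, float *c6, float *c12)
{
    float sigma2 = sigma * sigma;
    float sigma6 = sigma2 * sigma2 * sigma2;
    *c6  = epsilon * sigma6; /* 6*C6   */
    *c12 = *c6 * sigma6;     /* 12*C12 */
}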
 
-                                /* avoid NaN for excluded pairs at r=0 */
-                                r2      += (1.0f - int_bit) * NBNXN_AVOID_SING_R2_INC;
+                                // Ensure the distance does not become so small that r^-12 overflows
+                                r2      = max(r2, NBNXN_MIN_RSQ);
 
                                 inv_r   = rsqrt(r2);
                                 inv_r2  = inv_r * inv_r;
+#if !defined LJ_COMB_LB || defined CALC_ENERGIES
                                 inv_r6  = inv_r2 * inv_r2 * inv_r2;
 #if defined EXCLUSION_FORCES
                                 /* We could mask inv_r2, but with Ewald
@@ -393,6 +459,16 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
                                                      c6 * (inv_r6 + nbparam->dispersion_shift.cpot)*ONE_SIXTH_F);
 
 #endif
+#else                           /* ! LJ_COMB_LB || CALC_ENERGIES */
+                                float sig_r  = sigma*inv_r;
+                                float sig_r2 = sig_r*sig_r;
+                                float sig_r6 = sig_r2*sig_r2*sig_r2;
+#if defined EXCLUSION_FORCES
+                                sig_r6 *= int_bit;
+#endif                          /* EXCLUSION_FORCES */
+
+                                F_invr  = epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r2;
+#endif                          /* ! LJ_COMB_LB || CALC_ENERGIES */
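/* Illustrative sketch (not part of this patch): the force-only LB path above
 * is algebraically the same as the c6/c12 path. With c6 = eps*sig^6 and
 * c12 = c6*sig^6 (see the conversion sketch above):
 *   (c12*r^-6 - c6) * r^-6 * r^-2
 *     = eps*sig^6*r^-6 * (sig^6*r^-6 - 1) * r^-2
 *     = eps * sig_r6 * (sig_r6 - 1) * r^-2,
 * which avoids computing r^-6 explicitly. Standalone cross-check:
 */
#include <math.h>

static float f_invr_lb(float sigma, float epsilon, float r2)
{
    float inv_r  = 1.0f / sqrtf(r2);
    float sig_r  = sigma * inv_r;
    float sig_r2 = sig_r * sig_r;
    float sig_r6 = sig_r2 * sig_r2 * sig_r2;
    return epsilon * sig_r6 * (sig_r6 - 1.0f) * inv_r * inv_r;
}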
 
 
 #ifdef LJ_FORCE_SWITCH
@@ -514,6 +590,14 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
         }
     }
 
+    /* skip central shifts when summing shift forces */
+    if (nb_sci.shift == CENTRAL)
+    {
+        bCalcFshift = false;
+    }
+
+    fshift_buf = 0.0f;
+
     /* reduce i forces */
     for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
     {
@@ -529,11 +613,9 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
         barrier(CLK_LOCAL_MEM_FENCE);
     }
 
-    /* add up local shift forces into global mem */    
-       //if (bCalcFshift && tidxj == 0)
-    // atomicAdd_g_f3(&(fshift[3 * nb_sci.shift]),fshift_buf);
+    /* add up local shift forces into global mem */
     if (bCalcFshift)
-    {      
+    {
         /* Only threads with tidxj < 3 will update fshift.
            The threads performing the update must be the same as the threads
            that stored the reduction result in the reduce_force_i function
@@ -554,3 +636,5 @@ __global float *restrict fshift,            /* stores float3 values */     /* OU
 #undef EL_EWALD_ANY
 #undef EXCLUSION_FORCES
 #undef LJ_EWALD
+
+#undef LJ_COMB