Merge branch release-2016
[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_cuda / nbnxn_cuda_kernel.cuh
index a9411e42ec8aef631f8e62afe8aa2a705da51c1a..c4ec038d2c4504a46c70d8d66ce3c1b475bf1377 100644 (file)
@@ -46,6 +46,7 @@
  */
 
 #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
 #include "gromacs/math/utilities.h"
 #include "gromacs/pbcutil/ishift.h"
 /* Note that floating-point constants in CUDA code should be suffixed
@@ -53,8 +54,8 @@
  * code that is in double precision.
  */
 
-#if GMX_PTX_ARCH < 300
-#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300"
+#if GMX_PTX_ARCH < 300 && GMX_PTX_ARCH != 0
+#error "nbnxn_cuda_kernel.cuh included in a device pass with GMX_PTX_ARCH < 300"
 #endif
 
 #if defined EL_EWALD_ANA || defined EL_EWALD_TAB
@@ -85,7 +86,7 @@
    Kernel launch parameters:
     - #blocks   = #pair lists, blockId = pair list Id
     - #threads  = NTHREAD_Z * c_clSize^2
-    - shmem     = see nbnxn_cuda.cu:calc_shmem_required()
+    - shmem     = see nbnxn_cuda.cu:calc_shmem_required_nonbonded()
 
     Each thread calculates an i force-component taking one pair of i-j atoms.
  */
@@ -194,15 +195,12 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #ifdef EL_RF
     float two_k_rf              = nbparam.two_k_rf;
 #endif
-#ifdef EL_EWALD_TAB
-    float coulomb_tab_scale     = nbparam.coulomb_tab_scale;
-#endif
 #ifdef EL_EWALD_ANA
     float beta2                 = nbparam.ewald_beta*nbparam.ewald_beta;
     float beta3                 = nbparam.ewald_beta*nbparam.ewald_beta*nbparam.ewald_beta;
 #endif
 #ifdef PRUNE_NBL
-    float rlist_sq              = nbparam.rlist_sq;
+    float rlist_sq              = nbparam.rlistOuter_sq;
 #endif
 
 #ifdef CALC_ENERGIES
@@ -285,7 +283,8 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
         ci = sci * c_numClPerSupercl + tidxj;
         ai = ci * c_clSize + tidxi;
 
-        xqbuf    = xq[ai] + shift_vec[nb_sci.shift];
+        float  *shiftptr = (float *)&shift_vec[nb_sci.shift];
+        xqbuf    = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0f);
         xqbuf.w *= nbparam.epsfac;
         xqib[tidxj * c_clSize + tidxi] = xqbuf;
 
@@ -327,7 +326,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif
 
 #ifdef LJ_EWALD
+    #if DISABLE_CUDA_TEXTURES
+            E_lj += LDG(&nbparam.nbfp[atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2]);
+    #else
             E_lj += tex1Dfetch<float>(nbparam.nbfp_texobj, atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2);
+    #endif
 #endif
         }
 
@@ -351,6 +354,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 
 #endif                                  /* CALC_ENERGIES */
 
+#ifdef EXCLUSION_FORCES
+    const int nonSelfInteraction  = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+#endif
+
     int          j4LoopStart      = cij4_start + tidxz;
     unsigned int j4LoopThreadMask = gmx_ballot_sync(c_fullWarpMask, j4LoopStart < cij4_end);
     /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
@@ -367,7 +374,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #endif
         {
             /* Pre-load cj into shared memory on both warps separately */
-            if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize)
+            if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
             {
                 cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
             }
@@ -433,10 +440,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 
                             /* cutoff & exclusion check */
 #ifdef EXCLUSION_FORCES
-                            if (r2 < rcoulomb_sq *
-                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+                            if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
 #else
-                            if (r2 < rcoulomb_sq * int_bit)
+                            if ((r2 < rcoulomb_sq) * int_bit)
 #endif
                             {
                                 /* load the rest of the i-atom parameters */
@@ -445,8 +451,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 #ifndef LJ_COMB
                                 /* LJ 6*C6 and 12*C12 */
                                 typei   = atib[i * c_clSize + tidxi];
-                                c6      = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej));
-                                c12     = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej) + 1);
+                                fetch_nbfp_c6_c12(c6, c12, nbparam, ntypes * typei + typej);
 #else
                                 ljcp_i  = ljcpib[i * c_clSize + tidxi];
 #ifdef LJ_COMB_GEOM
@@ -520,9 +525,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 
 #ifdef LJ_POT_SWITCH
 #ifdef CALC_ENERGIES
-                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+                                calculate_potential_switch_F_E(nbparam, inv_r, r2, &F_invr, &E_lj_p);
 #else
-                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+                                calculate_potential_switch_F(nbparam, inv_r, r2, &F_invr, &E_lj_p);
 #endif /* CALC_ENERGIES */
 #endif /* LJ_POT_SWITCH */
 
@@ -556,7 +561,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                                 F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
 #elif defined EL_EWALD_TAB
                                 F_invr  += qi * qj_f * (int_bit*inv_r2 -
-                                                        interpolate_coulomb_force_r(nbparam.coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)) * inv_r;
+                                                        interpolate_coulomb_force_r(nbparam, r2 * inv_r)) * inv_r;
 #endif                          /* EL_EWALD_ANA/TAB */
 
 #ifdef CALC_ENERGIES