Merge branch release-2016

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_cuda / nbnxn_cuda_kernel.cuh
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh

index a9411e42ec8aef631f8e62afe8aa2a705da51c1a..c4ec038d2c4504a46c70d8d66ce3c1b475bf1377 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel.cuh
@@ -46,6 +46,7 @@
   */
  
  #include "gromacs/gpu_utils/cuda_arch_utils.cuh"
+#include "gromacs/gpu_utils/cuda_kernel_utils.cuh"
  #include "gromacs/math/utilities.h"
  #include "gromacs/pbcutil/ishift.h"
  /* Note that floating-point constants in CUDA code should be suffixed
@@ -53,8 +54,8 @@
   * code that is in double precision.
   */
  
-#if GMX_PTX_ARCH < 300
-#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300"
+#if GMX_PTX_ARCH < 300 && GMX_PTX_ARCH != 0
+#error "nbnxn_cuda_kernel.cuh included with GMX_PTX_ARCH < 300 or host pass"
  #endif
  
  #if defined EL_EWALD_ANA || defined EL_EWALD_TAB
@@ -85,7 +86,7 @@
     Kernel launch parameters:
      - #blocks   = #pair lists, blockId = pair list Id
      - #threads  = NTHREAD_Z * c_clSize^2
-    - shmem     = see nbnxn_cuda.cu:calc_shmem_required()
+    - shmem     = see nbnxn_cuda.cu:calc_shmem_required_nonbonded()
  
      Each thread calculates an i force-component taking one pair of i-j atoms.
   */
@@ -194,15 +195,12 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  #ifdef EL_RF
      float two_k_rf              = nbparam.two_k_rf;
  #endif
-#ifdef EL_EWALD_TAB
-    float coulomb_tab_scale     = nbparam.coulomb_tab_scale;
-#endif
  #ifdef EL_EWALD_ANA
      float beta2                 = nbparam.ewald_beta*nbparam.ewald_beta;
      float beta3                 = nbparam.ewald_beta*nbparam.ewald_beta*nbparam.ewald_beta;
  #endif
  #ifdef PRUNE_NBL
-    float rlist_sq              = nbparam.rlist_sq;
+    float rlist_sq              = nbparam.rlistOuter_sq;
  #endif
  
  #ifdef CALC_ENERGIES
@@ -285,7 +283,8 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
          ci = sci * c_numClPerSupercl + tidxj;
          ai = ci * c_clSize + tidxi;
  
-        xqbuf    = xq[ai] + shift_vec[nb_sci.shift];
+        float  *shiftptr = (float *)&shift_vec[nb_sci.shift];
+        xqbuf    = xq[ai] + make_float4(LDG(shiftptr), LDG(shiftptr + 1), LDG(shiftptr + 2), 0.0f);
          xqbuf.w *= nbparam.epsfac;
          xqib[tidxj * c_clSize + tidxi] = xqbuf;
  
@@ -327,7 +326,11 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  #endif
  
  #ifdef LJ_EWALD
+    #if DISABLE_CUDA_TEXTURES
+            E_lj += LDG(&nbparam.nbfp[atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2]);
+    #else
              E_lj += tex1Dfetch<float>(nbparam.nbfp_texobj, atom_types[(sci*c_numClPerSupercl + i)*c_clSize + tidxi]*(ntypes + 1)*2);
+    #endif
  #endif
          }
  
@@ -351,6 +354,10 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  
  #endif                                  /* CALC_ENERGIES */
  
+#ifdef EXCLUSION_FORCES
+    const int nonSelfInteraction  = !(nb_sci.shift == CENTRAL & tidxj <= tidxi);
+#endif
+
      int          j4LoopStart      = cij4_start + tidxz;
      unsigned int j4LoopThreadMask = gmx_ballot_sync(c_fullWarpMask, j4LoopStart < cij4_end);
      /* loop over the j clusters = seen by any of the atoms in the current super-cluster */
@@ -367,7 +374,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  #endif
          {
              /* Pre-load cj into shared memory on both warps separately */
-            if ((tidxj == 0 || tidxj == 4) && tidxi < c_nbnxnGpuJgroupSize)
+            if ((tidxj == 0 | tidxj == 4) & (tidxi < c_nbnxnGpuJgroupSize))
              {
                  cjs[tidxi + tidxj * c_nbnxnGpuJgroupSize/c_splitClSize] = pl_cj4[j4].cj[tidxi];
              }
@@ -433,10 +440,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  
                              /* cutoff & exclusion check */
  #ifdef EXCLUSION_FORCES
-                            if (r2 < rcoulomb_sq *
-                                (nb_sci.shift != CENTRAL || ci != cj || tidxj > tidxi))
+                            if ((r2 < rcoulomb_sq) * (nonSelfInteraction | (ci != cj)))
  #else
-                            if (r2 < rcoulomb_sq * int_bit)
+                            if ((r2 < rcoulomb_sq) * int_bit)
  #endif
                              {
                                  /* load the rest of the i-atom parameters */
@@ -445,8 +451,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  #ifndef LJ_COMB
                                  /* LJ 6*C6 and 12*C12 */
                                  typei   = atib[i * c_clSize + tidxi];
-                                c6      = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej));
-                                c12     = tex1Dfetch<float>(nbparam.nbfp_texobj, 2 * (ntypes * typei + typej) + 1);
+                                fetch_nbfp_c6_c12(c6, c12, nbparam, ntypes * typei + typej);
  #else
                                  ljcp_i  = ljcpib[i * c_clSize + tidxi];
  #ifdef LJ_COMB_GEOM
@@ -520,9 +525,9 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
  
  #ifdef LJ_POT_SWITCH
  #ifdef CALC_ENERGIES
-                                calculate_potential_switch_F_E(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+                                calculate_potential_switch_F_E(nbparam, inv_r, r2, &F_invr, &E_lj_p);
  #else
-                                calculate_potential_switch_F(nbparam, c6, c12, inv_r, r2, &F_invr, &E_lj_p);
+                                calculate_potential_switch_F(nbparam, inv_r, r2, &F_invr, &E_lj_p);
  #endif /* CALC_ENERGIES */
  #endif /* LJ_POT_SWITCH */
  
@@ -556,7 +561,7 @@ __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
                                  F_invr  += qi * qj_f * (int_bit*inv_r2*inv_r + pmecorrF(beta2*r2)*beta3);
  #elif defined EL_EWALD_TAB
                                  F_invr  += qi * qj_f * (int_bit*inv_r2 -
-                                                        interpolate_coulomb_force_r(nbparam.coulomb_tab_texobj, r2 * inv_r, coulomb_tab_scale)) * inv_r;
+                                                        interpolate_coulomb_force_r(nbparam, r2 * inv_r)) * inv_r;
  #endif                          /* EL_EWALD_ANA/TAB */
  
  #ifdef CALC_ENERGIES