Merge branch release-2016

[alexxy/gromacs.git] / src / gromacs / mdlib / nbnxn_cuda / nbnxn_cuda_kernel_utils.cuh
diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh

index 3c0f016515a9842d921f359fee20c778bd0e4cf7..71f1901434c914cb5ef53782aca3070d42f0bac2 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
+++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda_kernel_utils.cuh
@@ -66,18 +66,18 @@
  
  /*! \brief Log of the i and j cluster size.
   *  change this together with c_clSize !*/
-static const int      c_clSizeLog2  = 3;
+static const int          c_clSizeLog2  = 3;
  /*! \brief Square of cluster size. */
-static const int      c_clSizeSq    = c_clSize*c_clSize;
+static const int          c_clSizeSq    = c_clSize*c_clSize;
  /*! \brief j-cluster size after split (4 in the current implementation). */
-static const int      c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
+static const int          c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
  /*! \brief Stride in the force accumualation buffer */
-static const int      c_fbufStride  = c_clSizeSq;
+static const int          c_fbufStride  = c_clSizeSq;
  /*! \brief i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
-static const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
+static const unsigned     superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
  
-static const float    c_oneSixth    = 0.16666667f;
-static const float    c_oneTwelveth = 0.08333333f;
+static const float        c_oneSixth    = 0.16666667f;
+static const float        c_oneTwelveth = 0.08333333f;
  
  
  /*! Convert LJ sigma,epsilon parameters to C6,C12. */
@@ -537,26 +537,27 @@ void reduce_force_j_generic(float *f_buf, float3 *fout,
  #if GMX_PTX_ARCH >= 300 || GMX_PTX_ARCH == 0
  static __forceinline__ __device__
  void reduce_force_j_warp_shfl(float3 f, float3 *fout,
-                              int tidxi, int aidx)
+                              int tidxi, int aidx,
+                              const unsigned int activemask)
  {
-    f.x += __shfl_down(f.x, 1);
-    f.y += __shfl_up  (f.y, 1);
-    f.z += __shfl_down(f.z, 1);
+    f.x += gmx_shfl_down_sync(activemask, f.x, 1);
+    f.y += gmx_shfl_up_sync  (activemask, f.y, 1);
+    f.z += gmx_shfl_down_sync(activemask, f.z, 1);
  
      if (tidxi & 1)
      {
          f.x = f.y;
      }
  
-    f.x += __shfl_down(f.x, 2);
-    f.z += __shfl_up  (f.z, 2);
+    f.x += gmx_shfl_down_sync(activemask, f.x, 2);
+    f.z += gmx_shfl_up_sync  (activemask, f.z, 2);
  
      if (tidxi & 2)
      {
          f.x = f.z;
      }
  
-    f.x += __shfl_down(f.x, 4);
+    f.x += gmx_shfl_down_sync(activemask, f.x, 4);
  
      if (tidxi < 3)
      {
@@ -664,19 +665,20 @@ void reduce_force_i(float *f_buf, float3 *f,
  static __forceinline__ __device__
  void reduce_force_i_warp_shfl(float3 fin, float3 *fout,
                                float *fshift_buf, bool bCalcFshift,
-                              int tidxj, int aidx)
+                              int tidxj, int aidx,
+                              const unsigned int activemask)
  {
-    fin.x += __shfl_down(fin.x, c_clSize);
-    fin.y += __shfl_up  (fin.y, c_clSize);
-    fin.z += __shfl_down(fin.z, c_clSize);
+    fin.x += gmx_shfl_down_sync(activemask, fin.x, c_clSize);
+    fin.y += gmx_shfl_up_sync  (activemask, fin.y, c_clSize);
+    fin.z += gmx_shfl_down_sync(activemask, fin.z, c_clSize);
  
      if (tidxj & 1)
      {
          fin.x = fin.y;
      }
  
-    fin.x += __shfl_down(fin.x, 2*c_clSize);
-    fin.z += __shfl_up  (fin.z, 2*c_clSize);
+    fin.x += gmx_shfl_down_sync(activemask, fin.x, 2*c_clSize);
+    fin.z += gmx_shfl_up_sync  (activemask, fin.z, 2*c_clSize);
  
      if (tidxj & 2)
      {
@@ -741,7 +743,8 @@ void reduce_energy_pow2(volatile float *buf,
  static __forceinline__ __device__
  void reduce_energy_warp_shfl(float E_lj, float E_el,
                               float *e_lj, float *e_el,
-                             int tidx)
+                             int tidx,
+                             const unsigned int activemask)
  {
      int i, sh;
  
@@ -749,8 +752,8 @@ void reduce_energy_warp_shfl(float E_lj, float E_el,
  #pragma unroll 5
      for (i = 0; i < 5; i++)
      {
-        E_lj += __shfl_down(E_lj, sh);
-        E_el += __shfl_down(E_el, sh);
+        E_lj += gmx_shfl_down_sync(activemask, E_lj, sh);
+        E_el += gmx_shfl_down_sync(activemask, E_el, sh);
          sh   += sh;
      }