Use subgroup shuffle for reduction

author Roland Schulz <roland.schulz@intel.com>

Thu, 26 Apr 2018 05:25:40 +0000 (22:25 -0700)

committer Mark Abraham <mark.j.abraham@gmail.com>

Tue, 2 Oct 2018 07:17:24 +0000 (09:17 +0200)
author Roland Schulz <roland.schulz@intel.com>
Thu, 26 Apr 2018 05:25:40 +0000 (22:25 -0700)
committer Mark Abraham <mark.j.abraham@gmail.com>
Tue, 2 Oct 2018 07:17:24 +0000 (09:17 +0200)
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh

index a40eb78610feb22add40b52903c0d7f187130438..eb7e43fc60ccd14c509bc1ac2ac263fa69b2a0d5 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
@@ -98,6 +98,9 @@
     Thus if more strings need to be appended a new macro must be written or it must be directly appended here.
   */
  __attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1)))
+#if REDUCE_SHUFFLE
+__attribute__((intel_reqd_sub_group_size(WARP_SIZE))) //2*WARP_SIZE could be enabled, see comment in reduce_energy_shfl
+#endif
  #ifdef PRUNE_NBL
      #ifdef CALC_ENERGIES
  __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl)
@@ -209,15 +212,14 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
  #endif
      unsigned int wexcl, imask, mask_ji;
      float4       xqbuf;
-    float3       xi, xj, rv, f_ij, fcj_buf /*, fshift_buf*/;
-    float        fshift_buf;
+    float3       xi, xj, rv, f_ij, fcj_buf;
      float3       fci_buf[NCL_PER_SUPERCL]; /* i force buffer */
      nbnxn_sci_t  nb_sci;
  
      /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */
      const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U);
  
-#define LOCAL_OFFSET xqib + NCL_PER_SUPERCL * CL_SIZE
+#define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE)
      __local int *cjs;
  #if USE_CJ_PREFETCH
      /* shmem buffer for cj, for both warps separately */
@@ -239,11 +241,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
  #endif
  #endif
  
-#ifndef REDUCE_SHUFFLE
+#if !REDUCE_SHUFFLE
      /* shmem j force buffer */
      __local float *f_buf   = (__local float *)(LOCAL_OFFSET);
      #undef LOCAL_OFFSET
-    #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3
+    #define LOCAL_OFFSET (f_buf + CL_SIZE * CL_SIZE * 3)
+#else
+    __local float *f_buf  = 0;
  #endif
      /* Local buffer used to implement __any warp vote function from CUDA.
         volatile is used to avoid compiler optimizations for AMD builds. */
@@ -343,7 +347,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
          imask       = pl_cj4[j4].imei[widx].imask;
          wexcl       = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)];
  
-        preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask);
+        preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u);
  
  #ifndef PRUNE_NBL
          if (imask)
@@ -388,8 +392,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
                      {
                          if (imask & mask_ji)
                          {
-                            ci_offset   = i;                     /* i force buffer offset */
-
                              ci      = sci * NCL_PER_SUPERCL + i; /* i cluster index */
                              ai      = ci * CL_SIZE + tidxi;      /* i atom index */
  
@@ -580,7 +582,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
                                  fcj_buf -= f_ij;
  
                                  /* accumulate i forces in registers */
-                                fci_buf[ci_offset] += f_ij;
+                                fci_buf[i] += f_ij;
                              }
                          }
  
@@ -589,13 +591,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
                      }
  
                      /* reduce j forces */
-
-                    /* store j forces in shmem */
-                    f_buf[                  tidx] = fcj_buf.x;
-                    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
-                    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
-
-                    reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj);
+                    reduce_force_j(f_buf, fcj_buf, f, tidxi, tidxj, aj);
                  }
              }
  #ifdef PRUNE_NBL
@@ -610,43 +606,14 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
      /* skip central shifts when summing shift forces */
      if (nb_sci.shift == CENTRAL)
      {
-        bCalcFshift = false;
+        bCalcFshift = 0;
      }
-
-    fshift_buf = 0.0f;
-
      /* reduce i forces */
-    for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
-    {
-        ai  = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
-
-        f_buf[                  tidx] = fci_buf[ci_offset].x;
-        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
-        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
-        barrier(CLK_LOCAL_MEM_FENCE);
-        reduce_force_i(f_buf, f,
-                       &fshift_buf, bCalcFshift,
-                       tidxi, tidxj, ai);
-    }
-
-    /* add up local shift forces into global mem */
-    if (bCalcFshift)
-    {
-        /* Only threads with tidxj < 3 will update fshift.
-           The threads performing the update must be the same with the threads
-           which stored the reduction result in reduce_force_i function
-         */
-        if (tidxj < 3)
-        {
-            atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf);
-        }
-    }
+    reduce_force_i_and_shift(f_buf, fci_buf, f, bCalcFshift != 0, tidxi, tidxj, sci,
+                             nb_sci.shift, fshift);
  
  #ifdef CALC_ENERGIES
-    /* flush the energies to shmem and reduce them */
-    f_buf[              tidx] = E_lj;
-    f_buf[FBUF_STRIDE + tidx] = E_el;
-    reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+    reduce_energy(f_buf, E_lj, E_el, e_lj, e_el, tidx);
  #endif
  }
  
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh

index 6a5128c7f02738f3567ea452dfae4f7162d5b19e..7d31a58e538d7d872dcefca42ac66496465ced7e 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
@@ -121,7 +121,7 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
      /*! i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */
      const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U);
  
-    #define LOCAL_OFFSET xib + c_numClPerSupercl * c_clSize
+    #define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize)
      /* shmem buffer for i cj pre-loading */
      __local int *cjs;
  #if USE_CJ_PREFETCH
@@ -188,7 +188,7 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
              imaskCheck = (imaskNew ^ imaskFull);
          }
  
-        preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck);
+        preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u);
  
          if (imaskCheck)
          {
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh

index b882f5901227e2cc0499fa17e7124e3cf3d67476..88aa20bfc1a9de42e347e08d79254b4fd82897ff 100644 (file)
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
@@ -39,7 +39,18 @@
  #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
  #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
  
-#define WARP_SIZE  (CL_SIZE*CL_SIZE/2)
+#define WARP_SIZE  (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported
+
+/* Nvidia+AMD don't support any subgroup extension (2.1 core or cl_khr_subgroups).
+   Code doesn't support CL_SIZE=8.
+   cl_intel_required_subgroup_size required for intel_reqd_sub_group_size.
+   cl_intel_subgroups required for intel_sub_group_shuffle_up/down.
+ */
+#if defined cl_intel_required_subgroup_size && defined cl_intel_subgroups && CL_SIZE == 4
+#define REDUCE_SHUFFLE 1
+#else
+#define REDUCE_SHUFFLE 0
+#endif
  
  /* 1.0 / sqrt(M_PI) */
  #define M_FLOAT_1_SQRTPI 0.564189583547756f
@@ -49,10 +60,6 @@
  #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH
  #define NBNXN_OPENCL_KERNEL_UTILS_CLH
  
-__constant sampler_t generic_sampler     = (CLK_NORMALIZED_COORDS_FALSE /* Natural coords */
-                                            | CLK_ADDRESS_NONE          /* No clamp/repeat*/
-                                            | CLK_FILTER_NEAREST);      /* No interpolation */
-
  #if CL_SIZE == 8
  #define WARP_SIZE_LOG2  (5)
  #define CL_SIZE_LOG2    (3)
@@ -465,7 +472,7 @@ void calculate_lj_ewald_comb_LB_F_E(__constant float    *nbfp_comb_climg2d,
      /* Subtract the grid force from the total LJ force */
      *F_invr  += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2;
  
-    if (with_E_lj == true)
+    if (with_E_lj)
      {
          float sh_mask;
  
@@ -532,13 +539,42 @@ float pmecorrF(float z2)
      return polyFN0*polyFD0;
  }
  
-/*! Final j-force reduction; this generic implementation works with
- *  arbitrary array sizes.
- */
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_force_j_shfl(float3 fin, __global float *fout,
+                         int tidxi, int tidxj, int aidx)
+{
+    /* Only does reduction over 4 elements in cluster. Needs to be changed
+     * for CL_SIZE>4. See CUDA code for required code */
+    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
+    fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
+    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
+    if (tidxi & 1 == 1)
+    {
+        fin.x = fin.y;
+    }
+    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
+    fin.z += intel_sub_group_shuffle_up  (fin.z, fin.z, 2);
+    if (tidxi == 2)
+    {
+        fin.x = fin.z;
+    }
+    if (tidxi < 3)
+    {
+        atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x);
+    }
+}
+#endif
+
  gmx_opencl_inline
-void reduce_force_j_generic(__local float *f_buf, __global float *fout,
+void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout,
                              int tidxi, int tidxj, int aidx)
  {
+    int tidx = tidxi + tidxj*CL_SIZE;
+    f_buf[                  tidx] = fcj_buf.x;
+    f_buf[    FBUF_STRIDE + tidx] = fcj_buf.y;
+    f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z;
+
      /* Split the reduction between the first 3 column threads
         Threads with column id 0 will do the reduction for (float3).x components
         Threads with column id 1 will do the reduction for (float3).y components
@@ -556,102 +592,185 @@ void reduce_force_j_generic(__local float *f_buf, __global float *fout,
      }
  }
  
-/*! Final i-force reduction; this generic implementation works with
- *  arbitrary array sizes.
+/*! Final j-force reduction
   */
  gmx_opencl_inline
-void reduce_force_i_generic(__local float *f_buf, __global float *fout,
-                            float *fshift_buf, bool bCalcFshift,
-                            int tidxi, int tidxj, int aidx)
+void reduce_force_j(__local float *f_buf, float3 fcj_buf, __global float *fout,
+                    int tidxi, int tidxj, int aidx)
+{
+#if REDUCE_SHUFFLE
+    reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx);
+#else
+    reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx);
+#endif
+}
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout,
+                                   bool bCalcFshift, int tidxi, int tidxj,
+                                   int sci, int shift, __global float *fshift)
  {
-    /* Split the reduction between the first 3 line threads
-       Threads with line id 0 will do the reduction for (float3).x components
-       Threads with line id 1 will do the reduction for (float3).y components
-       Threads with line id 2 will do the reduction for (float3).z components. */
-    if (tidxj < 3)
+    /* Only does reduction over 4 elements in cluster (2 per warp). Needs to be changed
+     * for CL_SIZE>4.*/
+    float2 fshift_buf = 0;
+    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
      {
-        float f = 0.0f;
-        for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE)
+        int    aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+        float3 fin  = fci_buf[ci_offset];
+        fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE);
+        fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, CL_SIZE);
+        fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE);
+
+        if (tidxj & 1)
          {
-            f += f_buf[tidxj * FBUF_STRIDE + j];
+            fin.x = fin.y;
          }
-
-        atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
-
+        /* Threads 0,1 and 2,3 increment x,y for their warp */
+        atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x);
          if (bCalcFshift)
          {
-            (*fshift_buf) += f;
+            fshift_buf[0] += fin.x;
+        }
+        /* Threads 0 and 2 increment z for their warp */
+        if ((tidxj & 1) == 0)
+        {
+            atomicAdd_g_f(&fout[3*aidx+2], fin.z);
+            if (bCalcFshift)
+            {
+                fshift_buf[1] += fin.z;
+            }
+        }
+    }
+    /* add up local shift forces into global mem */
+    if (bCalcFshift)
+    {
+        //Threads 0,1 and 2,3 update x,y
+        atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]);
+        //Threads 0 and 2 update z
+        if ((tidxj & 1) == 0)
+        {
+            atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]);
          }
      }
-    barrier(CLK_LOCAL_MEM_FENCE);
  }
+#endif
  
  /*! Final i-force reduction; this implementation works only with power of two
   *  array sizes.
   */
  gmx_opencl_inline
-void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
-                         float *fshift_buf, bool bCalcFshift,
-                         int tidxi, int tidxj, int aidx)
+void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf,
+                                   __global float *fout,
+                                   bool bCalcFshift,
+                                   int tidxi, int tidxj,
+                                   int sci, int shift, __global float *fshift)
  {
-    int     i, j;
-    /* Reduce the initial CL_SIZE values for each i atom to half
-     * every step by using CL_SIZE * i threads.
-     * Can't just use i as loop variable because than nvcc refuses to unroll.
-     */
-    i = CL_SIZE/2;
-    for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
+    float fshift_buf = 0;
+    for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++)
      {
-        if (tidxj < i)
+        int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi;
+        int tidx = tidxi + tidxj*CL_SIZE;
+        /* store i forces in shmem */
+        f_buf[                  tidx] = fci_buf[ci_offset].x;
+        f_buf[    FBUF_STRIDE + tidx] = fci_buf[ci_offset].y;
+        f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z;
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        int     i, j;
+        /* Reduce the initial CL_SIZE values for each i atom to half
+         * every step by using CL_SIZE * i threads.
+         * Can't just use i as loop variable because than nvcc refuses to unroll.
+         */
+        i = CL_SIZE/2;
+        for (j = CL_SIZE_LOG2 - 1; j > 0; j--)
          {
+            if (tidxj < i)
+            {
+
+                f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
+                f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+                f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+            }
+            i >>= 1;
+        }
+        /* needed because
+         * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
+         * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
+         * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd
+         */
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        /* i == 1, last reduction step, writing to global mem */
+        /* Split the reduction between the first 3 line threads
+           Threads with line id 0 will do the reduction for (float3).x components
+           Threads with line id 1 will do the reduction for (float3).y components
+           Threads with line id 2 will do the reduction for (float3).z components. */
+        if (tidxj < 3)
+        {
+            float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
  
-            f_buf[                  tidxj * CL_SIZE + tidxi] += f_buf[                  (tidxj + i) * CL_SIZE + tidxi];
-            f_buf[    FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[    FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
-            f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi];
+            atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
+
+            if (bCalcFshift)
+            {
+                fshift_buf += f;
+            }
          }
-        i >>= 1;
      }
-    /* needed because
-     * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
-     * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
-     */
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    /* i == 1, last reduction step, writing to global mem */
-    /* Split the reduction between the first 3 line threads
-       Threads with line id 0 will do the reduction for (float3).x components
-       Threads with line id 1 will do the reduction for (float3).y components
-       Threads with line id 2 will do the reduction for (float3).z components. */
-    if (tidxj < 3)
+    /* add up local shift forces into global mem */
+    if (bCalcFshift)
      {
-        float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi];
-
-        atomicAdd_g_f(&fout[3 * aidx + tidxj], f);
-
-        if (bCalcFshift)
+        /* Only threads with tidxj < 3 will update fshift.
+           The threads performing the update, must be the same as the threads
+           storing the reduction result above.
+         */
+        if (tidxj < 3)
          {
-            (*fshift_buf) += f;
+            atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf);
          }
      }
  }
  
-/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending
- *  on whether the size of the array to be reduced is power of two or not.
+/*! Final i-force reduction
   */
  gmx_opencl_inline
-void reduce_force_i(__local float *f_buf, __global float *f,
-                    float *fshift_buf, bool bCalcFshift,
-                    int tidxi, int tidxj, int ai)
+void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global float *f,
+                              bool bCalcFshift, int tidxi, int tidxj, int sci,
+                              int shift, __global float *fshift)
  {
-    if ((CL_SIZE & (CL_SIZE - 1)))
-    {
-        reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
-    }
-    else
+#if REDUCE_SHUFFLE
+    reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj,
+                                  sci, shift, fshift);
+#else
+    reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj,
+                                  sci, shift, fshift);
+#endif
+}
+
+
+
+#if REDUCE_SHUFFLE
+gmx_opencl_inline
+void reduce_energy_shfl(float E_lj, float E_el,
+                        __global float *e_lj,
+                        __global float *e_el,
+                        unsigned int    tidx)
+{
+    E_lj = sub_group_reduce_add(E_lj);
+    E_el = sub_group_reduce_add(E_el);
+    /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver.
+     * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair
+     * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or
+     * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed
+     * (by supporting c_nbnxnGpuClusterpairSplit=1). */
+    if (tidx == 0 || tidx == WARP_SIZE)
      {
-        reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai);
+        atomicAdd_g_f(e_lj, E_lj);
+        atomicAdd_g_f(e_el, E_el);
      }
  }
+#endif
  
  /*! Energy reduction; this implementation works only with power of two
   *  array sizes.
@@ -689,4 +808,21 @@ void reduce_energy_pow2(volatile __local float  *buf,
      }
  }
  
+gmx_opencl_inline
+void reduce_energy(volatile __local float  *buf,
+                   float E_lj, float E_el,
+                   volatile __global float *e_lj,
+                   volatile __global float *e_el,
+                   unsigned int             tidx)
+{
+#if REDUCE_SHUFFLE
+    reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx);
+#else
+    /* flush the energies to shmem and reduce them */
+    buf[              tidx] = E_lj;
+    buf[FBUF_STRIDE + tidx] = E_el;
+    reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
+#endif
+}
+
  #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */
author	Roland Schulz <roland.schulz@intel.com>
	Thu, 26 Apr 2018 05:25:40 +0000 (22:25 -0700)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Tue, 2 Oct 2018 07:17:24 +0000 (09:17 +0200)
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh		patch \| blob \| history
src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh		patch \| blob \| history