From 67837a421a087ce05f9ef17c3a1de0fc6ce99870 Mon Sep 17 00:00:00 2001 From: Roland Schulz Date: Wed, 25 Apr 2018 22:25:40 -0700 Subject: [PATCH] Use subgroup shuffle for reduction Change-Id: I121821e70836111adccd0619db992037af6bee3c --- .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh | 65 +--- .../nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh | 4 +- .../nbnxn_ocl/nbnxn_ocl_kernel_utils.clh | 280 +++++++++++++----- 3 files changed, 226 insertions(+), 123 deletions(-) diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh index a40eb78610..eb7e43fc60 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh @@ -98,6 +98,9 @@ Thus if more strings need to be appended a new macro must be written or it must be directly appended here. */ __attribute__((reqd_work_group_size(CL_SIZE, CL_SIZE, 1))) +#if REDUCE_SHUFFLE +__attribute__((intel_reqd_sub_group_size(WARP_SIZE))) //2*WARP_SIZE could be enabled, see comment in reduce_energy_shfl +#endif #ifdef PRUNE_NBL #ifdef CALC_ENERGIES __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _VF_prune_opencl) @@ -209,15 +212,14 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #endif unsigned int wexcl, imask, mask_ji; float4 xqbuf; - float3 xi, xj, rv, f_ij, fcj_buf /*, fshift_buf*/; - float fshift_buf; + float3 xi, xj, rv, f_ij, fcj_buf; float3 fci_buf[NCL_PER_SUPERCL]; /* i force buffer */ nbnxn_sci_t nb_sci; /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL=8 bits set */ const unsigned superClInteractionMask = ((1U << NCL_PER_SUPERCL) - 1U); -#define LOCAL_OFFSET xqib + NCL_PER_SUPERCL * CL_SIZE +#define LOCAL_OFFSET (xqib + NCL_PER_SUPERCL * CL_SIZE) __local int *cjs; #if USE_CJ_PREFETCH /* shmem buffer for cj, for both warps separately */ @@ -239,11 +241,13 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) #endif #endif -#ifndef REDUCE_SHUFFLE +#if !REDUCE_SHUFFLE /* shmem j force buffer */ __local float *f_buf = (__local float *)(LOCAL_OFFSET); #undef LOCAL_OFFSET - #define LOCAL_OFFSET f_buf + CL_SIZE * CL_SIZE * 3 + #define LOCAL_OFFSET (f_buf + CL_SIZE * CL_SIZE * 3) +#else + __local float *f_buf = 0; #endif /* Local buffer used to implement __any warp vote function from CUDA. volatile is used to avoid compiler optimizations for AMD builds. 
*/ @@ -343,7 +347,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) imask = pl_cj4[j4].imei[widx].imask; wexcl = excl[wexcl_idx].pair[(tidx) & (WARP_SIZE - 1)]; - preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask); + preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imask != 0u); #ifndef PRUNE_NBL if (imask) @@ -388,8 +392,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) { if (imask & mask_ji) { - ci_offset = i; /* i force buffer offset */ - ci = sci * NCL_PER_SUPERCL + i; /* i cluster index */ ai = ci * CL_SIZE + tidxi; /* i atom index */ @@ -580,7 +582,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) fcj_buf -= f_ij; /* accumulate i forces in registers */ - fci_buf[ci_offset] += f_ij; + fci_buf[i] += f_ij; } } @@ -589,13 +591,7 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) } /* reduce j forces */ - - /* store j forces in shmem */ - f_buf[ tidx] = fcj_buf.x; - f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y; - f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z; - - reduce_force_j_generic(f_buf, f, tidxi, tidxj, aj); + reduce_force_j(f_buf, fcj_buf, f, tidxi, tidxj, aj); } } #ifdef PRUNE_NBL @@ -610,43 +606,14 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl) /* skip central shifts when summing shift forces */ if (nb_sci.shift == CENTRAL) { - bCalcFshift = false; + bCalcFshift = 0; } - - fshift_buf = 0.0f; - /* reduce i forces */ - for (ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) - { - ai = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi; - - f_buf[ tidx] = fci_buf[ci_offset].x; - f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y; - f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z; - barrier(CLK_LOCAL_MEM_FENCE); - reduce_force_i(f_buf, f, - &fshift_buf, bCalcFshift, - tidxi, tidxj, ai); - } - - /* add up local shift forces into global mem */ - if (bCalcFshift) - { - /* Only threads with tidxj < 3 will update fshift. - The threads performing the update must be the same with the threads - which stored the reduction result in reduce_force_i function - */ - if (tidxj < 3) - { - atomicAdd_g_f(&(fshift[3 * nb_sci.shift + tidxj]), fshift_buf); - } - } + reduce_force_i_and_shift(f_buf, fci_buf, f, bCalcFshift != 0, tidxi, tidxj, sci, + nb_sci.shift, fshift); #ifdef CALC_ENERGIES - /* flush the energies to shmem and reduce them */ - f_buf[ tidx] = E_lj; - f_buf[FBUF_STRIDE + tidx] = E_el; - reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE); + reduce_energy(f_buf, E_lj, E_el, e_lj, e_el, tidx); #endif } diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh index 6a5128c7f0..7d31a58e53 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh @@ -121,7 +121,7 @@ __kernel void nbnxn_kernel_prune_rolling_opencl /*! 
i-cluster interaction mask for a super-cluster with all c_numClPerSupercl=8 bits set */ const unsigned superClInteractionMask = ((1U << c_numClPerSupercl) - 1U); - #define LOCAL_OFFSET xib + c_numClPerSupercl * c_clSize + #define LOCAL_OFFSET (xib + c_numClPerSupercl * c_clSize) /* shmem buffer for i cj pre-loading */ __local int *cjs; #if USE_CJ_PREFETCH @@ -188,7 +188,7 @@ __kernel void nbnxn_kernel_prune_rolling_opencl imaskCheck = (imaskNew ^ imaskFull); } - preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck); + preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck != 0u); if (imaskCheck) { diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh index b882f59012..88aa20bfc1 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh @@ -39,7 +39,18 @@ #define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE) #define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER) -#define WARP_SIZE (CL_SIZE*CL_SIZE/2) +#define WARP_SIZE (CL_SIZE*CL_SIZE/2) //Currently only c_nbnxnGpuClusterpairSplit=2 supported + +/* Nvidia+AMD don't support any subgroup extension (2.1 core or cl_khr_subgroups). + Code doesn't support CL_SIZE=8. + cl_intel_required_subgroup_size required for intel_reqd_sub_group_size. + cl_intel_subgroups required for intel_sub_group_shuffle_up/down. + */ +#if defined cl_intel_required_subgroup_size && defined cl_intel_subgroups && CL_SIZE == 4 +#define REDUCE_SHUFFLE 1 +#else +#define REDUCE_SHUFFLE 0 +#endif /* 1.0 / sqrt(M_PI) */ #define M_FLOAT_1_SQRTPI 0.564189583547756f @@ -49,10 +60,6 @@ #ifndef NBNXN_OPENCL_KERNEL_UTILS_CLH #define NBNXN_OPENCL_KERNEL_UTILS_CLH -__constant sampler_t generic_sampler = (CLK_NORMALIZED_COORDS_FALSE /* Natural coords */ - | CLK_ADDRESS_NONE /* No clamp/repeat*/ - | CLK_FILTER_NEAREST); /* No interpolation */ - #if CL_SIZE == 8 #define WARP_SIZE_LOG2 (5) #define CL_SIZE_LOG2 (3) @@ -465,7 +472,7 @@ void calculate_lj_ewald_comb_LB_F_E(__constant float *nbfp_comb_climg2d, /* Subtract the grid force from the total LJ force */ *F_invr += c6grid*(inv_r6_nm - expmcr2*(inv_r6_nm*poly + lje_coeff6_6))*inv_r2; - if (with_E_lj == true) + if (with_E_lj) { float sh_mask; @@ -532,13 +539,42 @@ float pmecorrF(float z2) return polyFN0*polyFD0; } -/*! Final j-force reduction; this generic implementation works with - * arbitrary array sizes. - */ +#if REDUCE_SHUFFLE +gmx_opencl_inline +void reduce_force_j_shfl(float3 fin, __global float *fout, + int tidxi, int tidxj, int aidx) +{ + /* Only does reduction over 4 elements in cluster. Needs to be changed + * for CL_SIZE>4. 
See CUDA code for required code */ + fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1); + fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, 1); + fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1); + if (tidxi & 1 == 1) + { + fin.x = fin.y; + } + fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2); + fin.z += intel_sub_group_shuffle_up (fin.z, fin.z, 2); + if (tidxi == 2) + { + fin.x = fin.z; + } + if (tidxi < 3) + { + atomicAdd_g_f(&fout[3 * aidx + tidxi], fin.x); + } +} +#endif + gmx_opencl_inline -void reduce_force_j_generic(__local float *f_buf, __global float *fout, +void reduce_force_j_generic(__local float *f_buf, float3 fcj_buf, __global float *fout, int tidxi, int tidxj, int aidx) { + int tidx = tidxi + tidxj*CL_SIZE; + f_buf[ tidx] = fcj_buf.x; + f_buf[ FBUF_STRIDE + tidx] = fcj_buf.y; + f_buf[2 * FBUF_STRIDE + tidx] = fcj_buf.z; + /* Split the reduction between the first 3 column threads Threads with column id 0 will do the reduction for (float3).x components Threads with column id 1 will do the reduction for (float3).y components @@ -556,102 +592,185 @@ void reduce_force_j_generic(__local float *f_buf, __global float *fout, } } -/*! Final i-force reduction; this generic implementation works with - * arbitrary array sizes. +/*! Final j-force reduction */ gmx_opencl_inline -void reduce_force_i_generic(__local float *f_buf, __global float *fout, - float *fshift_buf, bool bCalcFshift, - int tidxi, int tidxj, int aidx) +void reduce_force_j(__local float *f_buf, float3 fcj_buf, __global float *fout, + int tidxi, int tidxj, int aidx) +{ +#if REDUCE_SHUFFLE + reduce_force_j_shfl(fcj_buf, fout, tidxi, tidxj, aidx); +#else + reduce_force_j_generic(f_buf, fcj_buf, fout, tidxi, tidxj, aidx); +#endif +} + +#if REDUCE_SHUFFLE +gmx_opencl_inline +void reduce_force_i_and_shift_shfl(float3* fci_buf, __global float *fout, + bool bCalcFshift, int tidxi, int tidxj, + int sci, int shift, __global float *fshift) { - /* Split the reduction between the first 3 line threads - Threads with line id 0 will do the reduction for (float3).x components - Threads with line id 1 will do the reduction for (float3).y components - Threads with line id 2 will do the reduction for (float3).z components. */ - if (tidxj < 3) + /* Only does reduction over 4 elements in cluster (2 per warp). 
Needs to be changed + * for CL_SIZE>4.*/ + float2 fshift_buf = 0; + for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) { - float f = 0.0f; - for (int j = tidxi; j < CL_SIZE_SQ; j += CL_SIZE) + int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi; + float3 fin = fci_buf[ci_offset]; + fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, CL_SIZE); + fin.y += intel_sub_group_shuffle_up (fin.y, fin.y, CL_SIZE); + fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, CL_SIZE); + + if (tidxj & 1) { - f += f_buf[tidxj * FBUF_STRIDE + j]; + fin.x = fin.y; } - - atomicAdd_g_f(&fout[3 * aidx + tidxj], f); - + /* Threads 0,1 and 2,3 increment x,y for their warp */ + atomicAdd_g_f(&fout[3*aidx + (tidxj & 1)], fin.x); if (bCalcFshift) { - (*fshift_buf) += f; + fshift_buf[0] += fin.x; + } + /* Threads 0 and 2 increment z for their warp */ + if ((tidxj & 1) == 0) + { + atomicAdd_g_f(&fout[3*aidx+2], fin.z); + if (bCalcFshift) + { + fshift_buf[1] += fin.z; + } + } + } + /* add up local shift forces into global mem */ + if (bCalcFshift) + { + //Threads 0,1 and 2,3 update x,y + atomicAdd_g_f(&(fshift[3 * shift + (tidxj&1)]), fshift_buf[0]); + //Threads 0 and 2 update z + if ((tidxj & 1) == 0) + { + atomicAdd_g_f(&(fshift[3 * shift + 2]), fshift_buf[1]); } } - barrier(CLK_LOCAL_MEM_FENCE); } +#endif /*! Final i-force reduction; this implementation works only with power of two * array sizes. */ gmx_opencl_inline -void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout, - float *fshift_buf, bool bCalcFshift, - int tidxi, int tidxj, int aidx) +void reduce_force_i_and_shift_pow2(volatile __local float *f_buf, float3* fci_buf, + __global float *fout, + bool bCalcFshift, + int tidxi, int tidxj, + int sci, int shift, __global float *fshift) { - int i, j; - /* Reduce the initial CL_SIZE values for each i atom to half - * every step by using CL_SIZE * i threads. - * Can't just use i as loop variable because than nvcc refuses to unroll. - */ - i = CL_SIZE/2; - for (j = CL_SIZE_LOG2 - 1; j > 0; j--) + float fshift_buf = 0; + for (int ci_offset = 0; ci_offset < NCL_PER_SUPERCL; ci_offset++) { - if (tidxj < i) + int aidx = (sci * NCL_PER_SUPERCL + ci_offset) * CL_SIZE + tidxi; + int tidx = tidxi + tidxj*CL_SIZE; + /* store i forces in shmem */ + f_buf[ tidx] = fci_buf[ci_offset].x; + f_buf[ FBUF_STRIDE + tidx] = fci_buf[ci_offset].y; + f_buf[2 * FBUF_STRIDE + tidx] = fci_buf[ci_offset].z; + barrier(CLK_LOCAL_MEM_FENCE); + + int i, j; + /* Reduce the initial CL_SIZE values for each i atom to half + * every step by using CL_SIZE * i threads. + * Can't just use i as loop variable because than nvcc refuses to unroll. 
+ */ + i = CL_SIZE/2; + for (j = CL_SIZE_LOG2 - 1; j > 0; j--) { + if (tidxj < i) + { + + f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi]; + f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; + f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; + } + i >>= 1; + } + /* needed because + * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp + * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call + * TODO: Test on Nvidia for performance difference between having the barrier here or after the atomicAdd + */ + barrier(CLK_LOCAL_MEM_FENCE); + + /* i == 1, last reduction step, writing to global mem */ + /* Split the reduction between the first 3 line threads + Threads with line id 0 will do the reduction for (float3).x components + Threads with line id 1 will do the reduction for (float3).y components + Threads with line id 2 will do the reduction for (float3).z components. */ + if (tidxj < 3) + { + float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi]; - f_buf[ tidxj * CL_SIZE + tidxi] += f_buf[ (tidxj + i) * CL_SIZE + tidxi]; - f_buf[ FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[ FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; - f_buf[2 * FBUF_STRIDE + tidxj * CL_SIZE + tidxi] += f_buf[2 * FBUF_STRIDE + (tidxj + i) * CL_SIZE + tidxi]; + atomicAdd_g_f(&fout[3 * aidx + tidxj], f); + + if (bCalcFshift) + { + fshift_buf += f; + } } - i >>= 1; } - /* needed because - * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp - * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call - */ - barrier(CLK_LOCAL_MEM_FENCE); - - /* i == 1, last reduction step, writing to global mem */ - /* Split the reduction between the first 3 line threads - Threads with line id 0 will do the reduction for (float3).x components - Threads with line id 1 will do the reduction for (float3).y components - Threads with line id 2 will do the reduction for (float3).z components. */ - if (tidxj < 3) + /* add up local shift forces into global mem */ + if (bCalcFshift) { - float f = f_buf[tidxj * FBUF_STRIDE + tidxi] + f_buf[tidxj * FBUF_STRIDE + i * CL_SIZE + tidxi]; - - atomicAdd_g_f(&fout[3 * aidx + tidxj], f); - - if (bCalcFshift) + /* Only threads with tidxj < 3 will update fshift. + The threads performing the update, must be the same as the threads + storing the reduction result above. + */ + if (tidxj < 3) { - (*fshift_buf) += f; + atomicAdd_g_f(&(fshift[3 * shift + tidxj]), fshift_buf); } } } -/*! Final i-force reduction wrapper; calls the generic or pow2 reduction depending - * on whether the size of the array to be reduced is power of two or not. +/*! 
Final i-force reduction */ gmx_opencl_inline -void reduce_force_i(__local float *f_buf, __global float *f, - float *fshift_buf, bool bCalcFshift, - int tidxi, int tidxj, int ai) +void reduce_force_i_and_shift(__local float *f_buf, float3* fci_buf, __global float *f, + bool bCalcFshift, int tidxi, int tidxj, int sci, + int shift, __global float *fshift) { - if ((CL_SIZE & (CL_SIZE - 1))) - { - reduce_force_i_generic(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai); - } - else +#if REDUCE_SHUFFLE + reduce_force_i_and_shift_shfl(fci_buf, f, bCalcFshift, tidxi, tidxj, + sci, shift, fshift); +#else + reduce_force_i_and_shift_pow2(f_buf, fci_buf, f, bCalcFshift, tidxi, tidxj, + sci, shift, fshift); +#endif +} + + + +#if REDUCE_SHUFFLE +gmx_opencl_inline +void reduce_energy_shfl(float E_lj, float E_el, + __global float *e_lj, + __global float *e_el, + unsigned int tidx) +{ + E_lj = sub_group_reduce_add(E_lj); + E_el = sub_group_reduce_add(E_el); + /* Should be get_sub_group_local_id()==0. Doesn't work with Intel Classic driver. + * To make it possible to use REDUCE_SHUFFLE with single subgroup per i-j pair + * (e.g. subgroup size 16 with CL_SIZE 4), either this "if" needs to be changed or + * the definition of WARP_SIZE (currently CL_SIZE*CL_SIZE/2) needs to be changed + * (by supporting c_nbnxnGpuClusterpairSplit=1). */ + if (tidx == 0 || tidx == WARP_SIZE) { - reduce_force_i_pow2(f_buf, f, fshift_buf, bCalcFshift, tidxi, tidxj, ai); + atomicAdd_g_f(e_lj, E_lj); + atomicAdd_g_f(e_el, E_el); } } +#endif /*! Energy reduction; this implementation works only with power of two * array sizes. @@ -689,4 +808,21 @@ void reduce_energy_pow2(volatile __local float *buf, } } +gmx_opencl_inline +void reduce_energy(volatile __local float *buf, + float E_lj, float E_el, + volatile __global float *e_lj, + volatile __global float *e_el, + unsigned int tidx) +{ +#if REDUCE_SHUFFLE + reduce_energy_shfl(E_lj, E_el, e_lj, e_el, tidx); +#else + /* flush the energies to shmem and reduce them */ + buf[ tidx] = E_lj; + buf[FBUF_STRIDE + tidx] = E_el; + reduce_energy_pow2(buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE); +#endif +} + #endif /* NBNXN_OPENCL_KERNEL_UTILS_CLH */ -- 2.22.0
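
Appendix (illustrative only, not part of the patch): the shuffle folding introduced in reduce_force_j_shfl can be read in isolation as the minimal OpenCL sketch below. The kernel name, buffer arguments, and atom indexing are hypothetical; only the intel_sub_group_shuffle_up/down pattern that folds the x/y/z components of a 4-lane cluster mirrors the patch. It assumes CL_SIZE == 4, a work-group holding a single cluster on consecutive sub-group lanes, and the cl_intel_subgroups extension; the real kernel accumulates into global memory with atomicAdd_g_f, while a plain store keeps the sketch standalone.

/* Minimal sketch of the reduce_force_j_shfl folding pattern.
 * Hypothetical kernel/buffer names; assumes CL_SIZE == 4 and cl_intel_subgroups. */
#pragma OPENCL EXTENSION cl_intel_subgroups : enable

__attribute__((reqd_work_group_size(4, 1, 1)))
__kernel void sketch_reduce_force_j(__global const float4 *fin_xyz, /* per-lane partial forces */
                                    __global float        *fout)    /* 3 floats per cluster    */
{
    const int lane = (int)get_local_id(0);   /* position within the 4-lane cluster   */
    const int aidx = (int)get_group_id(0);   /* hypothetical j-atom index            */
    float3    fin  = fin_xyz[get_global_id(0)].xyz;

    /* Fold lane pairs: x and z move down, y moves up, so after this step the
     * even lanes hold 2-lane partial sums of x and z, the odd lanes of y. */
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 1);
    fin.y += intel_sub_group_shuffle_up  (fin.y, fin.y, 1);
    fin.z += intel_sub_group_shuffle_down(fin.z, fin.z, 1);
    if ((lane & 1) == 1)
    {
        fin.x = fin.y;                       /* odd lanes now carry the y sum */
    }

    /* Fold across the remaining stride of 2; lane 2 picks up the z sum.
     * Lane 3 (and anything read from outside the cluster) ends up unused. */
    fin.x += intel_sub_group_shuffle_down(fin.x, fin.x, 2);
    fin.z += intel_sub_group_shuffle_up  (fin.z, fin.z, 2);
    if (lane == 2)
    {
        fin.x = fin.z;
    }

    /* Lanes 0, 1 and 2 now hold the cluster-wide x, y and z sums. */
    if (lane < 3)
    {
        fout[3 * aidx + lane] = fin.x;
    }
}

After the two folding steps only lanes 0, 1 and 2 hold meaningful data (the summed x, y and z), which is why the patched kernel issues its atomic adds only from threads with tidxi < 3.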