From a168d2e692fa1228998e222076de8f47956569b3 Mon Sep 17 00:00:00 2001
From: Roland Schulz <roland.schulz@intel.com>
Date: Wed, 28 Mar 2018 18:55:52 -0700
Subject: [PATCH] Allow OCL CL_SIZE to be set to 4 for Intel

Add GMX_OCL_CLUSTER_SIZE which can be set to 4 for e.g. Intel.
The kernel should now work on any HW with at least
CL_SIZE*CL_SIZE/2 wide sub-groups (warp-sync execution).
This is 8(/32) for CL_SIZE 4(/8). Not tested for CL_SIZE other
than 4 or 8.

Fixes:
- make_fep_list_supersub was incorrect for CL_SIZE!=8.
- reduce_force_i_pow2 was incorrect for CL_SIZE<8 and 2 warps.
- i-atom preload, nbnxn_excl_t, warp-any init for CL_SIZE!=8.
- gpu_ref for CL_SIZE!=8.

Change-Id: I1114e408d28b9eb6306722c41fd6a6ccec52211b
---
 cmake/gmxManageOpenCL.cmake                   |  3 ++
 src/config.h.cmakein                          |  3 ++
 src/gromacs/gpu_utils/ocl_compiler.cpp        |  5 ++-
 .../nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp    |  6 ++--
 .../mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh      | 26 +++++++--------
 .../nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh  | 25 +++++++-------
 .../nbnxn_ocl/nbnxn_ocl_kernel_utils.clh      | 33 ++++++++++++++-----
 src/gromacs/mdlib/nbnxn_pairlist.h            |  8 ++++-
 src/gromacs/mdlib/nbnxn_search.cpp            |  3 +-
 9 files changed, 72 insertions(+), 40 deletions(-)

diff --git a/cmake/gmxManageOpenCL.cmake b/cmake/gmxManageOpenCL.cmake
index 4b77330a12..447e02ecad 100644
--- a/cmake/gmxManageOpenCL.cmake
+++ b/cmake/gmxManageOpenCL.cmake
@@ -64,6 +64,9 @@ add_definitions(${OpenCL_DEFINITIONS})
 
 include_directories(SYSTEM ${OpenCL_INCLUDE_DIRS})
 
+set(GMX_OCL_CLUSTER_SIZE 8 CACHE STRING "Cluster size used by OpenCL kernel. Set to 4 for Intel GPUs.")
+mark_as_advanced(GMX_OCL_CLUSTER_SIZE)
+
 macro(gmx_gpu_setup)
     # no OpenMP is no good!
     if(NOT GMX_OPENMP)
diff --git a/src/config.h.cmakein b/src/config.h.cmakein
index a5b3598601..347e31acee 100644
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -241,6 +241,9 @@
 /* Use a single compilation unit when compiling the CUDA (non-bonded) kernels.  */
 #cmakedefine01 GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
 
+/* Cluster size used by OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
+#define GMX_OCL_CLUSTER_SIZE @GMX_OCL_CLUSTER_SIZE@
+
 /* Use NVML */
 #cmakedefine01 HAVE_NVML
 
diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp
index 182ec20192..62f8a4e75f 100644
--- a/src/gromacs/gpu_utils/ocl_compiler.cpp
+++ b/src/gromacs/gpu_utils/ocl_compiler.cpp
@@ -326,8 +326,11 @@ makeVendorFlavorChoice(ocl_vendor_id_t vendorId)
         case OCL_VENDOR_NVIDIA:
             choice = "-D_NVIDIA_SOURCE_";
             break;
+        case OCL_VENDOR_INTEL:
+            choice = "-D_INTEL_SOURCE_";
+            break;
         default:
-            choice = "-D_WARPLESS_SOURCE_";
+            choice = "";
             break;
     }
     return choice;
diff --git a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp
index 309d9a33e4..9ccd46a4eb 100644
--- a/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp
+++ b/src/gromacs/mdlib/nbnxn_kernels/nbnxn_kernel_gpu_ref.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -234,7 +234,9 @@ nbnxn_kernel_gpu_ref(const nbnxn_pairlist_t     *nbl,
                                     continue;
                                 }
 
-                                int_bit = ((excl[jc >> 2]->pair[(jc & 3)*c_clSize + ic] >> (jm*c_numClPerSupercl + im)) & 1);
+                                constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
+                                int_bit = ((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic]
+                                            >> (jm*c_numClPerSupercl + im)) & 1);
 
                                 js               = ja*nbat->xstride;
                                 jfs              = ja*nbat->fstride;
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
index 41d8e69884..a40eb78610 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel.clh
@@ -255,24 +255,26 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
     cij4_start  = nb_sci.cj4_ind_start; /* first ...*/
     cij4_end    = nb_sci.cj4_ind_end;   /* and last index of j clusters */
 
-    /* Pre-load i-atom x and q into shared memory */
-    ci = sci * NCL_PER_SUPERCL + tidxj;
-    ai = ci * CL_SIZE + tidxi;
-
-    xqbuf    = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
-    xqbuf.w *= nbparam->epsfac;
-    xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
+    for (i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+    {
+        /* Pre-load i-atom x and q into shared memory */
+        ci = sci * NCL_PER_SUPERCL + tidxj+i;
+        ai = ci * CL_SIZE + tidxi;
 
+        xqbuf    = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+        xqbuf.w *= nbparam->epsfac;
+        xqib[(tidxj + i) * CL_SIZE + tidxi] = xqbuf;
 #ifdef IATYPE_SHMEM
 #ifndef LJ_COMB
-    /* Pre-load the i-atom types into shared memory */
-    atib[tidxj * CL_SIZE + tidxi]   = atom_types[ai];
+        /* Pre-load the i-atom types into shared memory */
+        atib[(tidxj + i) * CL_SIZE + tidxi]   = atom_types[ai];
 #else
-    ljcpib[tidxj * CL_SIZE + tidxi] = lj_comb[ai];
+        ljcpib[(tidxj + i) * CL_SIZE + tidxi] = lj_comb[ai];
 #endif
 #endif
+    }
     /* Initialise warp vote. (8x8 block) 2 warps for nvidia */
-    if (tidx == 0 || tidx == 32)
+    if (tidx == 0 || tidx == WARP_SIZE)
     {
         warp_any[widx] = 0;
     }
@@ -625,7 +627,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
         reduce_force_i(f_buf, f,
                        &fshift_buf, bCalcFshift,
                        tidxi, tidxj, ai);
-        barrier(CLK_LOCAL_MEM_FENCE);
     }
 
     /* add up local shift forces into global mem */
@@ -646,7 +647,6 @@ __kernel void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_opencl)
     f_buf[              tidx] = E_lj;
     f_buf[FBUF_STRIDE + tidx] = E_el;
     reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
-
 #endif
 }
 
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
index 126c47adc8..6a5128c7f0 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_pruneonly.clh
@@ -42,8 +42,8 @@
  *  \ingroup module_mdlib
  */
 
-#ifndef _WARPLESS_SOURCE_
-/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel
+#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
  * Note that this should precede the kernel_utils include.
  */
 #define USE_CJ_PREFETCH 1
@@ -150,14 +150,17 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
 
     if (tidxz == 0)
     {
-        /* Pre-load i-atom x and q into shared memory */
-        int ci = sci * c_numClPerSupercl + tidxj;
-        int ai = ci * c_clSize + tidxi;
+        for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+        {
+            /* Pre-load i-atom x and q into shared memory */
+            int ci = sci * c_numClPerSupercl + tidxj+i;
+            int ai = ci * c_clSize + tidxi;
 
-        /* We don't need q, but using float4 in shmem avoids bank conflicts */
-        float4 tmp = xq[ai];
-        float4 xi  = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
-        xib[tidxj * c_clSize + tidxi] = xi;
+            /* We don't need q, but using float4 in shmem avoids bank conflicts */
+            float4 tmp = xq[ai];
+            float4 xi  = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+            xib[(tidxj + i) * c_clSize + tidxi] = xi;
+        }
     }
     barrier(CLK_LOCAL_MEM_FENCE);
 
@@ -186,10 +189,6 @@ __kernel void nbnxn_kernel_prune_rolling_opencl
         }
 
         preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck);
-#if defined _WARPLESS_SOURCE_ && USE_CJ_PREFETCH
-        /* can't assume wavefront width, need to sync before we can consume cj4 from local memory */
-        barrier(CLK_LOCAL_MEM_FENCE);
-#endif
 
         if (imaskCheck)
         {
diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
index 63d13a3bc6..99ac7e1e30 100644
--- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
+++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl_kernel_utils.clh
@@ -38,7 +38,7 @@
 #define CL_SIZE                 (NBNXN_GPU_CLUSTER_SIZE)
 #define NCL_PER_SUPERCL         (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
 
-#define WARP_SIZE  32
+#define WARP_SIZE  (CL_SIZE*CL_SIZE/2)
 
 #undef KERNEL_UTILS_INLINE
 #ifdef KERNEL_UTILS_INLINE
@@ -61,8 +61,16 @@ __constant sampler_t generic_sampler     = (CLK_NORMALIZED_COORDS_FALSE /* Natur
 
 #define __device__
 
+#if CL_SIZE == 8
 #define WARP_SIZE_LOG2  (5)
-#define CL_SIZE_LOG2    (3)  /* change this together with CL_SIZE !*/
+#define CL_SIZE_LOG2    (3)
+#elif CL_SIZE == 4
+#define WARP_SIZE_LOG2  (3)
+#define CL_SIZE_LOG2    (2)
+#else
+#error unsupported CL_SIZE
+#endif
+
 #define CL_SIZE_SQ      (CL_SIZE * CL_SIZE)
 #define FBUF_STRIDE     (CL_SIZE_SQ)
 
@@ -142,9 +150,9 @@ typedef struct {
 
 
 typedef struct {
-    unsigned int pair[32]; /* Topology exclusion interaction bits for one warp,
-                            * each unsigned has bitS for 4*8 i clusters
-                            */
+    unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
+                                           * each unsigned has bitS for 4*8 i clusters
+                                           */
 } nbnxn_excl_t;
 
 /*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
@@ -210,13 +218,14 @@ int loadCj(__local int        *sm_cjPreload,
     const int c_nbnxnGpuClusterpairSplit = 2;
     const int c_splitClSize              = c_clSize/c_nbnxnGpuClusterpairSplit;
 
-#if _NVIDIA_SOURCE_
+#if USE_CJ_PREFETCH
+#if defined _NVIDIA_SOURCE_
     int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize;
-#else // defined _WARPLESS_SOURCE_ || defined _AMD_SOURCE_
+#elif defined _AMD_SOURCE_
     int warpLoadOffset = 0;
+#else
+#error Not supported
 #endif
-
-#if USE_CJ_PREFETCH
     return sm_cjPreload[jm + warpLoadOffset];
 #else
     return gm_cj[jm];
@@ -583,6 +592,7 @@ void reduce_force_i_generic(__local float *f_buf, __global float *fout,
             (*fshift_buf) += f;
         }
     }
+    barrier(CLK_LOCAL_MEM_FENCE);
 }
 
 /*! Final i-force reduction; this implementation works only with power of two
@@ -610,6 +620,11 @@ void reduce_force_i_pow2(volatile __local float *f_buf, __global float *fout,
         }
         i >>= 1;
     }
+    /* needed because
+     * a) for CL_SIZE<8: id 2 (doing z in next block) is in 2nd warp
+     * b) for all CL_SIZE a barrier is needed before f_buf is reused by next reduce_force_i call
+     */
+    barrier(CLK_LOCAL_MEM_FENCE);
 
     /* i == 1, last reduction step, writing to global mem */
     /* Split the reduction between the first 3 line threads
diff --git a/src/gromacs/mdlib/nbnxn_pairlist.h b/src/gromacs/mdlib/nbnxn_pairlist.h
index 4e064ec910..352c679146 100644
--- a/src/gromacs/mdlib/nbnxn_pairlist.h
+++ b/src/gromacs/mdlib/nbnxn_pairlist.h
@@ -36,6 +36,8 @@
 #ifndef _nbnxn_pairlist_h
 #define _nbnxn_pairlist_h
 
+#include "config.h"
+
 #include <cstddef>
 
 #include "gromacs/math/vectypes.h"
@@ -74,8 +76,12 @@ struct NbnxnListParameters
 
 /*! \endcond */
 
-/* With GPU kernels the i and j cluster size is 8 atoms */
+/* With GPU kernels the i and j cluster size is 8 atoms for CUDA and can be set at compile time for OpenCL */
+#if GMX_GPU == GMX_GPU_OPENCL
+static constexpr int c_nbnxnGpuClusterSize = GMX_OCL_CLUSTER_SIZE;
+#else
 static constexpr int c_nbnxnGpuClusterSize = 8;
+#endif
 
 /* The number of clusters in a super-cluster, used for GPU */
 static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
diff --git a/src/gromacs/mdlib/nbnxn_search.cpp b/src/gromacs/mdlib/nbnxn_search.cpp
index 917319f910..e9d990bd22 100644
--- a/src/gromacs/mdlib/nbnxn_search.cpp
+++ b/src/gromacs/mdlib/nbnxn_search.cpp
@@ -1986,7 +1986,8 @@ static void make_fep_list_supersub(const nbnxn_search_t    nbs,
                                     unsigned int  excl_bit;
                                     real          dx, dy, dz;
 
-                                    get_nbl_exclusions_1(nbl, cj4_ind, j>>2, &excl);
+                                    const int     jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
+                                    get_nbl_exclusions_1(nbl, cj4_ind, jHalf, &excl);
 
                                     excl_pair = a_mod_wj(j)*nbl->na_ci + i;
                                     excl_bit  = (1U << (gcj*c_gpuNumClusterPerCell + c));
-- 
2.22.0