Add GMX_OCL_CLUSTER_SIZE which can be set to 4 for e.g. Intel GPUs.
The kernel should now work on any hardware with sub-groups
(warp-sync execution) at least CL_SIZE*CL_SIZE/2 wide, i.e. 8 wide
for CL_SIZE 4 and 32 wide for CL_SIZE 8. Not tested for CL_SIZE
other than 4 or 8.
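For reference, a minimal check of that width requirement (plain C,
illustrative only; one warp covers half of the CL_SIZE x CL_SIZE
thread block because of the 2-way cluster-pair split):

#include <assert.h>
static int minSubGroupWidth(int clSize) { return clSize*clSize/2; }
int main(void)
{
    assert(minSubGroupWidth(4) == 8);  /* CL_SIZE 4 */
    assert(minSubGroupWidth(8) == 32); /* CL_SIZE 8 */
    return 0;
}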
Fixes:
- make_fep_list_supersub was incorrect for CL_SIZE!=8.
- reduce_force_i_pow2 was incorrect for CL_SIZE<8 and 2 warps.
- i-atom preload, nbnxn_excl_t, and warp-any init were incorrect for CL_SIZE!=8.
- gpu_ref was incorrect for CL_SIZE!=8.
Change-Id: I1114e408d28b9eb6306722c41fd6a6ccec52211b
include_directories(SYSTEM ${OpenCL_INCLUDE_DIRS})
+set(GMX_OCL_CLUSTER_SIZE 8 CACHE STRING "Cluster size used by OpenCL kernel. Set to 4 for Intel GPUs.")
+mark_as_advanced(GMX_OCL_CLUSTER_SIZE)
+
macro(gmx_gpu_setup)
# no OpenMP is no good!
if(NOT GMX_OPENMP)
/* Use a single compilation unit when compiling the CUDA (non-bonded) kernels. */
#cmakedefine01 GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
+/* Cluster size used by OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
+#define GMX_OCL_CLUSTER_SIZE @GMX_OCL_CLUSTER_SIZE@
+
/* Use NVML */
#cmakedefine01 HAVE_NVML
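The value is chosen at configure time (e.g. cmake -DGMX_OCL_CLUSTER_SIZE=4,
using the cache variable added above) and reaches C code through this
template. A sketch of a compile-time guard one could place next to the
define (not part of the patch):

/* Sketch: only 4 and 8 are exercised; reject anything else early. */
#if defined(GMX_OCL_CLUSTER_SIZE) && \
    (GMX_OCL_CLUSTER_SIZE != 4) && (GMX_OCL_CLUSTER_SIZE != 8)
#error "GMX_OCL_CLUSTER_SIZE must be 4 or 8"
#endif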
case OCL_VENDOR_NVIDIA:
choice = "-D_NVIDIA_SOURCE_";
break;
+ case OCL_VENDOR_INTEL:
+ choice = "-D_INTEL_SOURCE_";
+ break;
default:
- choice = "-D_WARPLESS_SOURCE_";
+ choice = "";
break;
}
return choice;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
continue;
}
- int_bit = ((excl[jc >> 2]->pair[(jc & 3)*c_clSize + ic] >> (jm*c_numClPerSupercl + im)) & 1);
+ constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
+ int_bit = ((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic]
+ >> (jm*c_numClPerSupercl + im)) & 1);
js = ja*nbat->xstride;
jfs = ja*nbat->fstride;
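To see what the new exclusion indexing does, here is a worked example with
c_nbnxnGpuClusterSize = 4 and the usual 2-way cluster-pair split, so
clusterPerSplit = 2 (names are local to the example):

/* Sketch: j-atoms 0,1 read the first exclusion half, 2,3 the second;
 * (jc & (clusterPerSplit - 1)) picks the row within that half. */
#include <assert.h>
enum { clSize = 4, split = 2, clusterPerSplit = clSize/split };
int main(void)
{
    assert(0/clusterPerSplit == 0 && 1/clusterPerSplit == 0); /* half 0 */
    assert(2/clusterPerSplit == 1 && 3/clusterPerSplit == 1); /* half 1 */
    assert((2 & (clusterPerSplit - 1)) == 0);                 /* row 0  */
    assert((3 & (clusterPerSplit - 1)) == 1);                 /* row 1  */
    return 0;
}

The old hard-coded jc >> 2 and jc & 3 only matched c_clSize 8.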
cij4_start = nb_sci.cj4_ind_start; /* first ...*/
cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
- /* Pre-load i-atom x and q into shared memory */
- ci = sci * NCL_PER_SUPERCL + tidxj;
- ai = ci * CL_SIZE + tidxi;
-
- xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
- xqbuf.w *= nbparam->epsfac;
- xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
+ for (i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+ {
+ /* Pre-load i-atom x and q into shared memory */
+ ci = sci * NCL_PER_SUPERCL + tidxj + i;
+ ai = ci * CL_SIZE + tidxi;
+ xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ xqbuf.w *= nbparam->epsfac;
+ xqib[(tidxj + i) * CL_SIZE + tidxi] = xqbuf;
#ifdef IATYPE_SHMEM
#ifndef LJ_COMB
- /* Pre-load the i-atom types into shared memory */
- atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+ /* Pre-load the i-atom types into shared memory */
+ atib[(tidxj + i) * CL_SIZE + tidxi] = atom_types[ai];
#else
- ljcpib[tidxj * CL_SIZE + tidxi] = lj_comb[ai];
+ ljcpib[(tidxj + i) * CL_SIZE + tidxi] = lj_comb[ai];
#endif
#endif
+ }
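The loop is what makes the preload correct for CL_SIZE < NCL_PER_SUPERCL:
with CL_SIZE 4 there are only 4 tidxj values but 8 i-clusters, so each
thread must load two. An illustrative trace of the index coverage (plain C):

#include <stdio.h>
int main(void)
{
    const int CL_SIZE = 4, NCL_PER_SUPERCL = 8;
    /* i = 0: tidxj 0..3 load i-clusters 0..3; i = 4: i-clusters 4..7 */
    for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
        for (int tidxj = 0; tidxj < CL_SIZE; tidxj++)
            printf("i=%d tidxj=%d -> i-cluster %d\n", i, tidxj, tidxj + i);
    return 0;
}

For CL_SIZE 8 the loop body runs exactly once, reproducing the old behaviour.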
- /* Initialise warp vote. (8x8 block) 2 warps for nvidia */
+ /* Initialise warp vote: 2 warps of WARP_SIZE threads each */
- if (tidx == 0 || tidx == 32)
+ if (tidx == 0 || tidx == WARP_SIZE)
{
warp_any[widx] = 0;
}
reduce_force_i(f_buf, f,
&fshift_buf, bCalcFshift,
tidxi, tidxj, ai);
- barrier(CLK_LOCAL_MEM_FENCE);
}
/* add up local shift forces into global mem */
f_buf[ tidx] = E_lj;
f_buf[FBUF_STRIDE + tidx] = E_el;
reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
-
#endif
}
* \ingroup module_mdlib
*/
-#ifndef _WARPLESS_SOURCE_
-/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel
+#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
* Note that this should precede the kernel_utils include.
*/
#define USE_CJ_PREFETCH 1
if (tidxz == 0)
{
- /* Pre-load i-atom x and q into shared memory */
- int ci = sci * c_numClPerSupercl + tidxj;
- int ai = ci * c_clSize + tidxi;
+ for (int i = 0; i < c_numClPerSupercl; i += c_clSize)
+ {
+ /* Pre-load i-atom x and q into shared memory */
+ int ci = sci * c_numClPerSupercl + tidxj + i;
+ int ai = ci * c_clSize + tidxi;
- /* We don't need q, but using float4 in shmem avoids bank conflicts */
- float4 tmp = xq[ai];
- float4 xi = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
- xib[tidxj * c_clSize + tidxi] = xi;
+ /* We don't need q, but using float4 in shmem avoids bank conflicts */
+ float4 tmp = xq[ai];
+ float4 xi = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ xib[(tidxj + i) * c_clSize + tidxi] = xi;
+ }
}
barrier(CLK_LOCAL_MEM_FENCE);
}
preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck);
-#if defined _WARPLESS_SOURCE_ && USE_CJ_PREFETCH
- /* can't assume wavefront width, need to sync before we can consume cj4 from local memory */
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif
if (imaskCheck)
{
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
-#define WARP_SIZE 32
+#define WARP_SIZE (CL_SIZE*CL_SIZE/2)
#undef KERNEL_UTILS_INLINE
#ifdef KERNEL_UTILS_INLINE
#define __device__
+#if CL_SIZE == 8
#define WARP_SIZE_LOG2 (5)
-#define CL_SIZE_LOG2 (3) /* change this together with CL_SIZE !*/
+#define CL_SIZE_LOG2 (3)
+#elif CL_SIZE == 4
+#define WARP_SIZE_LOG2 (3)
+#define CL_SIZE_LOG2 (2)
+#else
+#error unsupported CL_SIZE
+#endif
+
#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE (CL_SIZE_SQ)
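The kernels rely on these constants for shift-based index math, so they
must stay in sync with the sizes; a sketch of a guard using the macros
above (not part of the patch):

/* Sketch: e.g. tidxj*CL_SIZE + tidxi may be written as
 * (tidxj << CL_SIZE_LOG2) | tidxi, which is only valid if these hold. */
#if (1 << CL_SIZE_LOG2) != CL_SIZE || (1 << WARP_SIZE_LOG2) != WARP_SIZE
#error "log2 constants out of sync with CL_SIZE/WARP_SIZE"
#endif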
typedef struct {
- unsigned int pair[32]; /* Topology exclusion interaction bits for one warp,
- * each unsigned has bitS for 4*8 i clusters
- */
+ unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
+ * each unsigned has bits for 4*8 i clusters
+ */
} nbnxn_excl_t;
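Each pair[] entry holds one bit per (jm, im) cluster pair, which is the
layout the kernels index with jm*c_numClPerSupercl + im. A small
extraction sketch (names local to the example; 8 clusters per supercluster):

#include <assert.h>
/* Sketch: test whether i-cluster im interacts with j-cluster jm. */
static int interacts(unsigned int pairBits, int jm, int im)
{
    const int numClPerSupercl = 8; /* c_numClPerSupercl */
    return (int)((pairBits >> (jm*numClPerSupercl + im)) & 1u);
}
int main(void)
{
    assert(interacts(1u << 10, 1, 2) == 1); /* bit 10 = jm 1, im 2 */
    assert(interacts(1u << 10, 0, 2) == 0);
    return 0;
}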
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
const int c_nbnxnGpuClusterpairSplit = 2;
const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
-#if _NVIDIA_SOURCE_
+#if USE_CJ_PREFETCH
+#if defined _NVIDIA_SOURCE_
int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize;
-#else // defined _WARPLESS_SOURCE_ || defined _AMD_SOURCE_
+#elif defined _AMD_SOURCE_
int warpLoadOffset = 0;
+#else
+#error Not supported
#endif
-
-#if USE_CJ_PREFETCH
return sm_cjPreload[jm + warpLoadOffset];
#else
return gm_cj[jm];
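For the NVIDIA branch, a worked instance of the offset (illustrative,
assuming c_clSize = 8, c_splitClSize = 4, c_nbnxnGpuJgroupSize = 4):

#include <assert.h>
static int warpLoadOffset(int tidxj)
{
    const int jGroupSize = 4, splitClSize = 4; /* c_clSize 8, 2-way split */
    /* (tidxj & 4) is 0 in the first warp (tidxj 0..3) and 4 in the second */
    return (tidxj & 4) * jGroupSize / splitClSize;
}
int main(void)
{
    assert(warpLoadOffset(3) == 0); /* warp 0 reads sm_cjPreload[jm]     */
    assert(warpLoadOffset(5) == 4); /* warp 1 reads sm_cjPreload[jm + 4] */
    return 0;
}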
(*fshift_buf) += f;
}
}
+ barrier(CLK_LOCAL_MEM_FENCE);
}
/*! Final i-force reduction; this implementation works only with power of two
}
i >>= 1;
}
+ /* A barrier is needed here because:
+  * a) for CL_SIZE < 8, the thread with id 2 (which reduces z in the next block) is in the 2nd warp
+  * b) for all CL_SIZE, f_buf must not be reused by the next reduce_force_i call before all reads complete
+  */
+ barrier(CLK_LOCAL_MEM_FENCE);
/* i == 1, last reduction step, writing to global mem */
/* Split the reduction between the first 3 line threads
#ifndef _nbnxn_pairlist_h
#define _nbnxn_pairlist_h
+#include "config.h"
+
#include <cstddef>
#include "gromacs/math/vectypes.h"
/*! \endcond */
-/* With GPU kernels the i and j cluster size is 8 atoms */
+/* With GPU kernels the i and j cluster size is 8 atoms for CUDA and can be set at compile time for OpenCL */
+#if GMX_GPU == GMX_GPU_OPENCL
+static constexpr int c_nbnxnGpuClusterSize = GMX_OCL_CLUSTER_SIZE;
+#else
static constexpr int c_nbnxnGpuClusterSize = 8;
+#endif
/* The number of clusters in a super-cluster, used for GPU */
static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
unsigned int excl_bit;
real dx, dy, dz;
- get_nbl_exclusions_1(nbl, cj4_ind, j>>2, &excl);
+ const int jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
+ get_nbl_exclusions_1(nbl, cj4_ind, jHalf, &excl);
excl_pair = a_mod_wj(j)*nbl->na_ci + i;
excl_bit = (1U << (gcj*c_gpuNumClusterPerCell + c));