Add GMX_OCL_CLUSTER_SIZE which can be set to 4 for e.g. Intel GPUs.
The kernel should now work on any hardware with sub-groups
(warp-sync execution) at least CL_SIZE*CL_SIZE/2 wide, i.e. 8 wide
for CL_SIZE 4 and 32 wide for CL_SIZE 8. Not tested for CL_SIZE
other than 4 or 8.
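For reference, a minimal check of that width requirement (plain C,
illustrative only; one warp covers half of the CL_SIZE x CL_SIZE
thread block because of the 2-way cluster-pair split):

#include <assert.h>
static int minSubGroupWidth(int clSize) { return clSize*clSize/2; }
int main(void)
{
    assert(minSubGroupWidth(4) == 8);  /* CL_SIZE 4 */
    assert(minSubGroupWidth(8) == 32); /* CL_SIZE 8 */
    return 0;
}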
Fixes:
- make_fep_list_supersub was incorrect for CL_SIZE!=8.
- reduce_force_i_pow2 was incorrect for CL_SIZE<8 and 2 warps.
- i-atom preload, nbnxn_excl_t, and warp-any init were incorrect for CL_SIZE!=8.
- gpu_ref was incorrect for CL_SIZE!=8.
Change-Id: I1114e408d28b9eb6306722c41fd6a6ccec52211b
include_directories(SYSTEM ${OpenCL_INCLUDE_DIRS})
+set(GMX_OCL_CLUSTER_SIZE 8 CACHE STRING "Cluster size used by OpenCL kernel. Set to 4 for Intel GPUs.")
+mark_as_advanced(GMX_OCL_CLUSTER_SIZE)
+
macro(gmx_gpu_setup)
# no OpenMP is no good!
if(NOT GMX_OPENMP)
/* Use a single compilation unit when compiling the CUDA (non-bonded) kernels. */
#cmakedefine01 GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
+/* Cluster size used by OpenCL kernel. Should be 8 for NVIDIA/AMD and 4 for Intel */
+#define GMX_OCL_CLUSTER_SIZE @GMX_OCL_CLUSTER_SIZE@
+
/* Use NVML */
#cmakedefine01 HAVE_NVML
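The value is chosen at configure time (e.g. cmake -DGMX_OCL_CLUSTER_SIZE=4,
using the cache variable added above) and reaches C code through this
template. A sketch of a compile-time guard one could place next to the
define (not part of the patch):

/* Sketch: only 4 and 8 are exercised; reject anything else early. */
#if defined(GMX_OCL_CLUSTER_SIZE) && \
    (GMX_OCL_CLUSTER_SIZE != 4) && (GMX_OCL_CLUSTER_SIZE != 8)
#error "GMX_OCL_CLUSTER_SIZE must be 4 or 8"
#endif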
case OCL_VENDOR_NVIDIA:
choice = "-D_NVIDIA_SOURCE_";
break;
+ case OCL_VENDOR_INTEL:
+ choice = "-D_INTEL_SOURCE_";
+ break;
default:
- choice = "-D_WARPLESS_SOURCE_";
+ choice = "";
break;
}
return choice;
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2012,2013,2014,2015,2016,2017, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016,2017,2018, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
continue;
}
- int_bit = ((excl[jc >> 2]->pair[(jc & 3)*c_clSize + ic] >> (jm*c_numClPerSupercl + im)) & 1);
+ constexpr int clusterPerSplit = c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit;
+ int_bit = ((excl[jc/clusterPerSplit]->pair[(jc & (clusterPerSplit - 1))*c_clSize + ic]
+ >> (jm*c_numClPerSupercl + im)) & 1);
js = ja*nbat->xstride;
jfs = ja*nbat->fstride;
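To see what the new exclusion indexing does, here is a worked example with
c_nbnxnGpuClusterSize = 4 and the usual 2-way cluster-pair split, so
clusterPerSplit = 2 (names are local to the example):

/* Sketch: j-atoms 0,1 read the first exclusion half, 2,3 the second;
 * (jc & (clusterPerSplit - 1)) picks the row within that half. */
#include <assert.h>
enum { clSize = 4, split = 2, clusterPerSplit = clSize/split };
int main(void)
{
    assert(0/clusterPerSplit == 0 && 1/clusterPerSplit == 0); /* half 0 */
    assert(2/clusterPerSplit == 1 && 3/clusterPerSplit == 1); /* half 1 */
    assert((2 & (clusterPerSplit - 1)) == 0);                 /* row 0  */
    assert((3 & (clusterPerSplit - 1)) == 1);                 /* row 1  */
    return 0;
}

The old hard-coded jc >> 2 and jc & 3 only matched c_clSize 8.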
cij4_start = nb_sci.cj4_ind_start; /* first ...*/
cij4_end = nb_sci.cj4_ind_end; /* and last index of j clusters */
- /* Pre-load i-atom x and q into shared memory */
- ci = sci * NCL_PER_SUPERCL + tidxj;
- ai = ci * CL_SIZE + tidxi;
-
- xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
- xqbuf.w *= nbparam->epsfac;
- xqib[tidxj * CL_SIZE + tidxi] = xqbuf;
+ for (i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
+ {
+ /* Pre-load i-atom x and q into shared memory */
+ ci = sci * NCL_PER_SUPERCL + tidxj + i;
+ ai = ci * CL_SIZE + tidxi;
+ xqbuf = xq[ai] + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ xqbuf.w *= nbparam->epsfac;
+ xqib[(tidxj + i) * CL_SIZE + tidxi] = xqbuf;
#ifdef IATYPE_SHMEM
#ifndef LJ_COMB
- /* Pre-load the i-atom types into shared memory */
- atib[tidxj * CL_SIZE + tidxi] = atom_types[ai];
+ /* Pre-load the i-atom types into shared memory */
+ atib[(tidxj + i) * CL_SIZE + tidxi] = atom_types[ai];
#else
- ljcpib[tidxj * CL_SIZE + tidxi] = lj_comb[ai];
+ ljcpib[(tidxj + i) * CL_SIZE + tidxi] = lj_comb[ai];
#endif
#endif
+ }
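The loop is what makes the preload correct for CL_SIZE < NCL_PER_SUPERCL:
with CL_SIZE 4 there are only 4 tidxj values but 8 i-clusters, so each
thread must load two. An illustrative trace of the index coverage (plain C):

#include <stdio.h>
int main(void)
{
    const int CL_SIZE = 4, NCL_PER_SUPERCL = 8;
    /* i = 0: tidxj 0..3 load i-clusters 0..3; i = 4: i-clusters 4..7 */
    for (int i = 0; i < NCL_PER_SUPERCL; i += CL_SIZE)
        for (int tidxj = 0; tidxj < CL_SIZE; tidxj++)
            printf("i=%d tidxj=%d -> i-cluster %d\n", i, tidxj, tidxj + i);
    return 0;
}

For CL_SIZE 8 the loop body runs exactly once, reproducing the old behaviour.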
- /* Initialise warp vote. (8x8 block) 2 warps for nvidia */
+ /* Initialise warp vote: 2 warps of WARP_SIZE threads each */
- if (tidx == 0 || tidx == 32)
+ if (tidx == 0 || tidx == WARP_SIZE)
{
warp_any[widx] = 0;
}
reduce_force_i(f_buf, f,
&fshift_buf, bCalcFshift,
tidxi, tidxj, ai);
- barrier(CLK_LOCAL_MEM_FENCE);
}
/* add up local shift forces into global mem */
f_buf[ tidx] = E_lj;
f_buf[FBUF_STRIDE + tidx] = E_el;
reduce_energy_pow2(f_buf + (tidx & WARP_SIZE), e_lj, e_el, tidx & ~WARP_SIZE);
-
#endif
}
* \ingroup module_mdlib
*/
-#ifndef _WARPLESS_SOURCE_
-/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for the "nowarp" kernel
+#if defined _NVIDIA_SOURCE_ || defined _AMD_SOURCE_
+/* Currently we enable CJ prefetch for AMD/NVIDIA and disable it for other vendors
* Note that this should precede the kernel_utils include.
*/
#define USE_CJ_PREFETCH 1
if (tidxz == 0)
{
- /* Pre-load i-atom x and q into shared memory */
- int ci = sci * c_numClPerSupercl + tidxj;
- int ai = ci * c_clSize + tidxi;
+ for (int i = 0; i < c_numClPerSupercl; i += c_clSize)
+ {
+ /* Pre-load i-atom x and q into shared memory */
+ int ci = sci * c_numClPerSupercl + tidxj + i;
+ int ai = ci * c_clSize + tidxi;
- /* We don't need q, but using float4 in shmem avoids bank conflicts */
- float4 tmp = xq[ai];
- float4 xi = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
- xib[tidxj * c_clSize + tidxi] = xi;
+ /* We don't need q, but using float4 in shmem avoids bank conflicts */
+ float4 tmp = xq[ai];
+ float4 xi = tmp + (float4)(shift_vec[3 * nb_sci.shift], shift_vec[3 * nb_sci.shift + 1], shift_vec[3 * nb_sci.shift + 2], 0.0f);
+ xib[(tidxj + i) * c_clSize + tidxi] = xi;
+ }
}
barrier(CLK_LOCAL_MEM_FENCE);
}
preloadCj4(cjs, pl_cj4[j4].cj, tidxi, tidxj, imaskCheck);
-#if defined _WARPLESS_SOURCE_ && USE_CJ_PREFETCH
- /* can't assume wavefront width, need to sync before we can consume cj4 from local memory */
- barrier(CLK_LOCAL_MEM_FENCE);
-#endif
if (imaskCheck)
{
#define CL_SIZE (NBNXN_GPU_CLUSTER_SIZE)
#define NCL_PER_SUPERCL (NBNXN_GPU_NCLUSTER_PER_SUPERCLUSTER)
-#define WARP_SIZE 32
+#define WARP_SIZE (CL_SIZE*CL_SIZE/2)
#undef KERNEL_UTILS_INLINE
#ifdef KERNEL_UTILS_INLINE
#define __device__
+#if CL_SIZE == 8
#define WARP_SIZE_LOG2 (5)
-#define CL_SIZE_LOG2 (3) /* change this together with CL_SIZE !*/
+#define CL_SIZE_LOG2 (3)
+#elif CL_SIZE == 4
+#define WARP_SIZE_LOG2 (3)
+#define CL_SIZE_LOG2 (2)
+#else
+#error unsupported CL_SIZE
+#endif
+
#define CL_SIZE_SQ (CL_SIZE * CL_SIZE)
#define FBUF_STRIDE (CL_SIZE_SQ)
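The kernels rely on these constants for shift-based index math, so they
must stay in sync with the sizes; a sketch of a guard using the macros
above (not part of the patch):

/* Sketch: e.g. tidxj*CL_SIZE + tidxi may be written as
 * (tidxj << CL_SIZE_LOG2) | tidxi, which is only valid if these hold. */
#if (1 << CL_SIZE_LOG2) != CL_SIZE || (1 << WARP_SIZE_LOG2) != WARP_SIZE
#error "log2 constants out of sync with CL_SIZE/WARP_SIZE"
#endif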
typedef struct {
- unsigned int pair[32]; /* Topology exclusion interaction bits for one warp,
- * each unsigned has bitS for 4*8 i clusters
- */
+ unsigned int pair[CL_SIZE*CL_SIZE/2]; /* Topology exclusion interaction bits for one warp,
+ * each unsigned has bits for 4*8 i clusters
+ */
} nbnxn_excl_t;
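Each pair[] entry holds one bit per (jm, im) cluster pair, which is the
layout the kernels index with jm*c_numClPerSupercl + im. A small
extraction sketch (names local to the example; 8 clusters per supercluster):

#include <assert.h>
/* Sketch: test whether i-cluster im interacts with j-cluster jm. */
static int interacts(unsigned int pairBits, int jm, int im)
{
    const int numClPerSupercl = 8; /* c_numClPerSupercl */
    return (int)((pairBits >> (jm*numClPerSupercl + im)) & 1u);
}
int main(void)
{
    assert(interacts(1u << 10, 1, 2) == 1); /* bit 10 = jm 1, im 2 */
    assert(interacts(1u << 10, 0, 2) == 0);
    return 0;
}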
/*! i-cluster interaction mask for a super-cluster with all NCL_PER_SUPERCL bits set */
const int c_nbnxnGpuClusterpairSplit = 2;
const int c_splitClSize = c_clSize/c_nbnxnGpuClusterpairSplit;
-#if _NVIDIA_SOURCE_
+#if USE_CJ_PREFETCH
+#if defined _NVIDIA_SOURCE_
int warpLoadOffset = (tidxj & 4) * c_nbnxnGpuJgroupSize/c_splitClSize;
-#else // defined _WARPLESS_SOURCE_ || defined _AMD_SOURCE_
+#elif defined _AMD_SOURCE_
int warpLoadOffset = 0;
+#else
+#error Not supported
#endif
-
-#if USE_CJ_PREFETCH
return sm_cjPreload[jm + warpLoadOffset];
#else
return gm_cj[jm];
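For the NVIDIA branch, a worked instance of the offset (illustrative,
assuming c_clSize = 8, c_splitClSize = 4, c_nbnxnGpuJgroupSize = 4):

#include <assert.h>
static int warpLoadOffset(int tidxj)
{
    const int jGroupSize = 4, splitClSize = 4; /* c_clSize 8, 2-way split */
    /* (tidxj & 4) is 0 in the first warp (tidxj 0..3) and 4 in the second */
    return (tidxj & 4) * jGroupSize / splitClSize;
}
int main(void)
{
    assert(warpLoadOffset(3) == 0); /* warp 0 reads sm_cjPreload[jm]     */
    assert(warpLoadOffset(5) == 4); /* warp 1 reads sm_cjPreload[jm + 4] */
    return 0;
}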
(*fshift_buf) += f;
}
}
+ barrier(CLK_LOCAL_MEM_FENCE);
}
/*! Final i-force reduction; this implementation works only with power of two
}
i >>= 1;
}
+ /* A barrier is needed here because:
+  * a) for CL_SIZE < 8, the thread with id 2 (which reduces z in the next block) is in the 2nd warp
+  * b) for all CL_SIZE, f_buf must not be reused by the next reduce_force_i call before all reads complete
+  */
+ barrier(CLK_LOCAL_MEM_FENCE);
/* i == 1, last reduction step, writing to global mem */
/* Split the reduction between the first 3 line threads
#ifndef _nbnxn_pairlist_h
#define _nbnxn_pairlist_h
+#include "config.h"
+
#include <cstddef>
#include "gromacs/math/vectypes.h"
/*! \endcond */
-/* With GPU kernels the i and j cluster size is 8 atoms */
+/* With GPU kernels the i and j cluster size is 8 atoms for CUDA and can be set at compile time for OpenCL */
+#if GMX_GPU == GMX_GPU_OPENCL
+static constexpr int c_nbnxnGpuClusterSize = GMX_OCL_CLUSTER_SIZE;
+#else
static constexpr int c_nbnxnGpuClusterSize = 8;
+#endif
/* The number of clusters in a super-cluster, used for GPU */
static constexpr int c_nbnxnGpuNumClusterPerSupercluster = 8;
unsigned int excl_bit;
real dx, dy, dz;
- get_nbl_exclusions_1(nbl, cj4_ind, j>>2, &excl);
+ const int jHalf = j/(c_nbnxnGpuClusterSize/c_nbnxnGpuClusterpairSplit);
+ get_nbl_exclusions_1(nbl, cj4_ind, jHalf, &excl);
excl_pair = a_mod_wj(j)*nbl->na_ci + i;
excl_bit = (1U << (gcj*c_gpuNumClusterPerCell + c));