From 691f1d0e45f14973c466dc27d32d901358679139 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Szil=C3=A1rd=20P=C3=A1ll?= <pall.szilard@gmail.com>
Date: Wed, 31 Oct 2018 21:04:02 +0100
Subject: [PATCH] Ensure minimum exec width of the PME OpenCL kernels

This change adds checks to make sure that we don't execute incorrect
kernels in the case of the rare event if the Intel OpenCL compiler
decides to generate spread or gather kernels for 8-wide execution.

Refs #2516 #2520

Change-Id: I7ab33accebe908a56eb194e8245dfcfa6f817324
---
 src/gromacs/ewald/pme-gpu-constants.h         |  7 ++++
 .../ewald/pme-gpu-program-impl-ocl.cpp        | 33 ++++++++++++++++++-
 src/gromacs/gpu_utils/ocl_compiler.cpp        | 32 ++++++++++--------
 src/gromacs/gpu_utils/ocl_compiler.h          | 15 +++++++--
 4 files changed, 71 insertions(+), 16 deletions(-)

diff --git a/src/gromacs/ewald/pme-gpu-constants.h b/src/gromacs/ewald/pme-gpu-constants.h
index 50accc397e..5f66f09d11 100644
--- a/src/gromacs/ewald/pme-gpu-constants.h
+++ b/src/gromacs/ewald/pme-gpu-constants.h
@@ -121,6 +121,13 @@ constexpr int c_virialAndEnergyCount = 7;
  */
 #define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
 
+/*! \brief Minimum execution width of the PME spread and gather kernels.
+ *
+ * Due to the one thread per atom and order=4 implementation constraints, order^2 threads
+ * should execute without synchronization needed. See PME_SPREADGATHER_THREADS_PER_ATOM
+ */
+constexpr int c_pmeSpreadGatherMinWarpSize = 16;
+
 /*! \brief
  * Atom data alignment (in terms of number of atoms).
  * This is the least common multiple of number of atoms processed by
diff --git a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
index ab605f18d8..f2c00f1516 100644
--- a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
+++ b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
@@ -73,7 +73,9 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo)
     }
 
     // kernel parameters
-    warpSize            = gmx::ocl::getWarpSize(context, deviceId);
+    warpSize            = gmx::ocl::getDeviceWarpSize(context, deviceId);
+    // TODO: for Intel ideally we'd want to set these based on the compiler warp size
+    // but given that we've done no tuning for Intel iGPU, this is as good as anything.
     spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize,
                                    deviceInfo->maxWorkGroupSize);
     solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize,
@@ -102,6 +104,31 @@ PmeGpuProgramImpl::~PmeGpuProgramImpl()
                                                      stat, ocl_get_error_string(stat).c_str()).c_str());
 }
 
+/*! \brief Ensure that spread/gather kernels have been compiled to a suitable warp size
+ *
+ * On Intel the exec width/warp is decided at compile-time and can be
+ * smaller than the minimum order^2 required in spread/gather ATM which
+ * we need to check for.
+ */
+static void checkRequiredWarpSize(const cl_kernel          kernel,
+                                  const char*              kernelName,
+                                  const gmx_device_info_t *deviceInfo)
+{
+    if (deviceInfo->vendor_e == OCL_VENDOR_INTEL)
+    {
+        size_t kernelWarpSize = gmx::ocl::getKernelWarpSize(kernel, deviceInfo->ocl_gpu_id.ocl_device_id);
+
+        if (kernelWarpSize < c_pmeSpreadGatherMinWarpSize)
+        {
+            const std::string errorString = gmx::formatString("PME OpenCL kernels require >=%d execution width, but the %s kernel "
+                                                              "has been compiled for the device %s to a %zu width and therefore it can not execute correctly.",
+                                                              c_pmeSpreadGatherMinWarpSize, kernelName,
+                                                              deviceInfo->device_name, kernelWarpSize);
+            GMX_THROW(gmx::InternalError(errorString));
+        }
+    }
+}
+
 void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
 {
     // We might consider storing program as a member variable if it's needed later
@@ -201,18 +228,22 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
         {
             splineAndSpreadKernel = kernel;
+            checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
         {
             spreadKernel = kernel;
+            checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
         {
             gatherKernel = kernel;
+            checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherReduceWithInputKernel"))
         {
             gatherReduceWithInputKernel = kernel;
+            checkRequiredWarpSize(gatherReduceWithInputKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
         {
diff --git a/src/gromacs/gpu_utils/ocl_compiler.cpp b/src/gromacs/gpu_utils/ocl_compiler.cpp
index e3e8b1e8c5..4e8866923d 100644
--- a/src/gromacs/gpu_utils/ocl_compiler.cpp
+++ b/src/gromacs/gpu_utils/ocl_compiler.cpp
@@ -253,7 +253,23 @@ getSourceRootPath(const std::string &sourceRelativePath)
     return Path::normalize(sourceRootPath);
 }
 
-size_t getWarpSize(cl_context context, cl_device_id deviceId)
+size_t getKernelWarpSize(cl_kernel kernel, cl_device_id deviceId)
+{
+    size_t warpSize = 0;
+    cl_int cl_error = clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                                               sizeof(warpSize), &warpSize, nullptr);
+    if (cl_error != CL_SUCCESS)
+    {
+        GMX_THROW(InternalError("Could not query OpenCL preferred workgroup size, error was " + ocl_get_error_string(cl_error)));
+    }
+    if (warpSize == 0)
+    {
+        GMX_THROW(InternalError(formatString("Invalid OpenCL warp size encountered")));
+    }
+    return warpSize;
+}
+
+size_t getDeviceWarpSize(cl_context context, cl_device_id deviceId)
 {
     cl_int      cl_error;
     const char *warpSizeKernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}";
@@ -275,17 +291,7 @@ size_t getWarpSize(cl_context context, cl_device_id deviceId)
         GMX_THROW(InternalError("Could not create OpenCL kernel to determine warp size, error was " + ocl_get_error_string(cl_error)));
     }
 
-    size_t warpSize = 0;
-    cl_error = clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
-                                        sizeof(warpSize), &warpSize, nullptr);
-    if (cl_error != CL_SUCCESS)
-    {
-        GMX_THROW(InternalError("Could not measure OpenCL warp size, error was " + ocl_get_error_string(cl_error)));
-    }
-    if (warpSize == 0)
-    {
-        GMX_THROW(InternalError(formatString("Did not measure a valid OpenCL warp size")));
-    }
+    size_t warpSize = getKernelWarpSize(kernel, deviceId);
 
     cl_error = clReleaseKernel(kernel);
     if (cl_error != CL_SUCCESS)
@@ -431,7 +437,7 @@ compileProgram(FILE              *fplog,
     /* Make the build options */
     std::string preprocessorOptions = makePreprocessorOptions(kernelRootPath,
                                                               includeRootPath,
-                                                              getWarpSize(context, deviceId),
+                                                              getDeviceWarpSize(context, deviceId),
                                                               deviceVendorId,
                                                               extraDefines);
 
diff --git a/src/gromacs/gpu_utils/ocl_compiler.h b/src/gromacs/gpu_utils/ocl_compiler.h
index 94dbe02ccb..a155d1c695 100644
--- a/src/gromacs/gpu_utils/ocl_compiler.h
+++ b/src/gromacs/gpu_utils/ocl_compiler.h
@@ -53,7 +53,7 @@ namespace gmx
 namespace ocl
 {
 
-/*! \brief Get the warp size reported by device
+/*! \brief Get the device-specific warp size
  *
  *  This is platform implementation dependent and seems to only work on the Nvidia and AMD platforms!
  *  Nvidia reports 32, AMD for GPU 64. Intel seems to report 16, but that is not correct,
@@ -66,7 +66,18 @@ namespace ocl
  *
  * \throws InternalError if an OpenCL error was encountered
  */
-size_t getWarpSize(cl_context context, cl_device_id deviceId);
+size_t getDeviceWarpSize(cl_context context, cl_device_id deviceId);
+
+
+/*! \brief Get the kernel-specific warp size
+ *
+ *  \param  kernel   THe OpenCL kernel object
+ *  \param  deviceId OpenCL device for which the kernel warp size is queried
+ *  \return cl_int value of the warp size
+ *
+ * \throws InternalError if an OpenCL error was encountered
+ */
+size_t getKernelWarpSize(cl_kernel kernel, cl_device_id deviceId);
 
 /*! \brief Compile the specified kernel for the context and device.
  *
-- 
2.22.0