Ensure minimum exec width of the PME OpenCL kernels
authorSzilárd Páll <pall.szilard@gmail.com>
Wed, 31 Oct 2018 20:04:02 +0000 (21:04 +0100)
committerMark Abraham <mark.j.abraham@gmail.com>
Wed, 7 Nov 2018 11:55:42 +0000 (12:55 +0100)
This change adds checks to make sure that we don't execute incorrect
kernels in the case of the rare event if the Intel OpenCL compiler
decides to generate spread or gather kernels for 8-wide execution.

Refs #2516 #2520

Change-Id: I7ab33accebe908a56eb194e8245dfcfa6f817324

src/gromacs/ewald/pme-gpu-constants.h
src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
src/gromacs/gpu_utils/ocl_compiler.cpp
src/gromacs/gpu_utils/ocl_compiler.h

index 50accc397ef888ca9b4981bc46333f2cb9b52726..5f66f09d11f67784b4b9871e5667c5d2a44316d5 100644 (file)
@@ -121,6 +121,13 @@ constexpr int c_virialAndEnergyCount = 7;
  */
 #define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
 
+/*! \brief Minimum execution width of the PME spread and gather kernels.
+ *
+ * Due to the one thread per atom and order=4 implementation constraints, order^2 threads
+ * should execute without synchronization needed. See PME_SPREADGATHER_THREADS_PER_ATOM
+ */
+constexpr int c_pmeSpreadGatherMinWarpSize = 16;
+
 /*! \brief
  * Atom data alignment (in terms of number of atoms).
  * This is the least common multiple of number of atoms processed by
index ab605f18d8f8c09106e690b16b9e63dbb23cc95f..f2c00f1516d4b83fc345e4067a9579d91bb3e191 100644 (file)
@@ -73,7 +73,9 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo)
     }
 
     // kernel parameters
-    warpSize            = gmx::ocl::getWarpSize(context, deviceId);
+    warpSize            = gmx::ocl::getDeviceWarpSize(context, deviceId);
+    // TODO: for Intel ideally we'd want to set these based on the compiler warp size
+    // but given that we've done no tuning for Intel iGPU, this is as good as anything.
     spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize,
                                    deviceInfo->maxWorkGroupSize);
     solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize,
@@ -102,6 +104,31 @@ PmeGpuProgramImpl::~PmeGpuProgramImpl()
                                                      stat, ocl_get_error_string(stat).c_str()).c_str());
 }
 
+/*! \brief Ensure that spread/gather kernels have been compiled to a suitable warp size
+ *
+ * On Intel the exec width/warp is decided at compile-time and can be
+ * smaller than the minimum order^2 required in spread/gather ATM which
+ * we need to check for.
+ */
+static void checkRequiredWarpSize(const cl_kernel          kernel,
+                                  const char*              kernelName,
+                                  const gmx_device_info_t *deviceInfo)
+{
+    if (deviceInfo->vendor_e == OCL_VENDOR_INTEL)
+    {
+        size_t kernelWarpSize = gmx::ocl::getKernelWarpSize(kernel, deviceInfo->ocl_gpu_id.ocl_device_id);
+
+        if (kernelWarpSize < c_pmeSpreadGatherMinWarpSize)
+        {
+            const std::string errorString = gmx::formatString("PME OpenCL kernels require >=%d execution width, but the %s kernel "
+                                                              "has been compiled for the device %s to a %zu width and therefore it can not execute correctly.",
+                                                              c_pmeSpreadGatherMinWarpSize, kernelName,
+                                                              deviceInfo->device_name, kernelWarpSize);
+            GMX_THROW(gmx::InternalError(errorString));
+        }
+    }
+}
+
 void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
 {
     // We might consider storing program as a member variable if it's needed later
@@ -201,18 +228,22 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
         {
             splineAndSpreadKernel = kernel;
+            checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
         {
             spreadKernel = kernel;
+            checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
         {
             gatherKernel = kernel;
+            checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherReduceWithInputKernel"))
         {
             gatherReduceWithInputKernel = kernel;
+            checkRequiredWarpSize(gatherReduceWithInputKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
         {
index e3e8b1e8c5424beb273566927bc3132d0f1be56a..4e8866923deea7fc8836ed4a64b408170a238ad5 100644 (file)
@@ -253,7 +253,23 @@ getSourceRootPath(const std::string &sourceRelativePath)
     return Path::normalize(sourceRootPath);
 }
 
-size_t getWarpSize(cl_context context, cl_device_id deviceId)
+size_t getKernelWarpSize(cl_kernel kernel, cl_device_id deviceId)
+{
+    size_t warpSize = 0;
+    cl_int cl_error = clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+                                               sizeof(warpSize), &warpSize, nullptr);
+    if (cl_error != CL_SUCCESS)
+    {
+        GMX_THROW(InternalError("Could not query OpenCL preferred workgroup size, error was " + ocl_get_error_string(cl_error)));
+    }
+    if (warpSize == 0)
+    {
+        GMX_THROW(InternalError(formatString("Invalid OpenCL warp size encountered")));
+    }
+    return warpSize;
+}
+
+size_t getDeviceWarpSize(cl_context context, cl_device_id deviceId)
 {
     cl_int      cl_error;
     const char *warpSizeKernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}";
@@ -275,17 +291,7 @@ size_t getWarpSize(cl_context context, cl_device_id deviceId)
         GMX_THROW(InternalError("Could not create OpenCL kernel to determine warp size, error was " + ocl_get_error_string(cl_error)));
     }
 
-    size_t warpSize = 0;
-    cl_error = clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
-                                        sizeof(warpSize), &warpSize, nullptr);
-    if (cl_error != CL_SUCCESS)
-    {
-        GMX_THROW(InternalError("Could not measure OpenCL warp size, error was " + ocl_get_error_string(cl_error)));
-    }
-    if (warpSize == 0)
-    {
-        GMX_THROW(InternalError(formatString("Did not measure a valid OpenCL warp size")));
-    }
+    size_t warpSize = getKernelWarpSize(kernel, deviceId);
 
     cl_error = clReleaseKernel(kernel);
     if (cl_error != CL_SUCCESS)
@@ -431,7 +437,7 @@ compileProgram(FILE              *fplog,
     /* Make the build options */
     std::string preprocessorOptions = makePreprocessorOptions(kernelRootPath,
                                                               includeRootPath,
-                                                              getWarpSize(context, deviceId),
+                                                              getDeviceWarpSize(context, deviceId),
                                                               deviceVendorId,
                                                               extraDefines);
 
index 94dbe02ccba30e490bb83b583586073eff770421..a155d1c695903fa1db4f694eac930a2e2bb6790e 100644 (file)
@@ -53,7 +53,7 @@ namespace gmx
 namespace ocl
 {
 
-/*! \brief Get the warp size reported by device
+/*! \brief Get the device-specific warp size
  *
  *  This is platform implementation dependent and seems to only work on the Nvidia and AMD platforms!
  *  Nvidia reports 32, AMD for GPU 64. Intel seems to report 16, but that is not correct,
@@ -66,7 +66,18 @@ namespace ocl
  *
  * \throws InternalError if an OpenCL error was encountered
  */
-size_t getWarpSize(cl_context context, cl_device_id deviceId);
+size_t getDeviceWarpSize(cl_context context, cl_device_id deviceId);
+
+
+/*! \brief Get the kernel-specific warp size
+ *
+ *  \param  kernel   THe OpenCL kernel object
+ *  \param  deviceId OpenCL device for which the kernel warp size is queried
+ *  \return cl_int value of the warp size
+ *
+ * \throws InternalError if an OpenCL error was encountered
+ */
+size_t getKernelWarpSize(cl_kernel kernel, cl_device_id deviceId);
 
 /*! \brief Compile the specified kernel for the context and device.
  *