*/
#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
+/*! \brief Minimum execution width of the PME spread and gather kernels.
+ *
+ * Due to the one thread per atom and order=4 implementation constraints, order^2 threads
+ * should execute without synchronization needed. See PME_SPREADGATHER_THREADS_PER_ATOM
+ */
+constexpr int c_pmeSpreadGatherMinWarpSize = 16;
+
/*! \brief
* Atom data alignment (in terms of number of atoms).
* This is the least common multiple of number of atoms processed by
}
// kernel parameters
- warpSize = gmx::ocl::getWarpSize(context, deviceId);
+ warpSize = gmx::ocl::getDeviceWarpSize(context, deviceId);
+ // TODO: for Intel ideally we'd want to set these based on the compiler warp size
+ // but given that we've done no tuning for Intel iGPU, this is as good as anything.
spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize,
deviceInfo->maxWorkGroupSize);
solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize,
stat, ocl_get_error_string(stat).c_str()).c_str());
}
+/*! \brief Ensure that spread/gather kernels have been compiled to a suitable warp size
+ *
+ * On Intel the exec width/warp is decided at compile-time and can be
+ * smaller than the minimum order^2 required in spread/gather ATM which
+ * we need to check for.
+ *
+ * \param[in] kernel      Compiled OpenCL kernel object to check.
+ * \param[in] kernelName  Kernel name, used only to format the error message.
+ * \param[in] deviceInfo  Device information (vendor, device id, device name).
+ *
+ * \throws InternalError if the kernel was compiled to an execution width
+ *         narrower than c_pmeSpreadGatherMinWarpSize.
+ */
+static void checkRequiredWarpSize(const cl_kernel kernel,
+ const char* kernelName,
+ const gmx_device_info_t *deviceInfo)
+{
+ // Only Intel needs the check: per the note above, its kernel exec width is
+ // chosen at compile time and may fall below the order^2 minimum.
+ if (deviceInfo->vendor_e == OCL_VENDOR_INTEL)
+ {
+ // Width this particular kernel binary was actually compiled to
+ size_t kernelWarpSize = gmx::ocl::getKernelWarpSize(kernel, deviceInfo->ocl_gpu_id.ocl_device_id);
+
+ if (kernelWarpSize < c_pmeSpreadGatherMinWarpSize)
+ {
+ const std::string errorString = gmx::formatString("PME OpenCL kernels require >=%d execution width, but the %s kernel "
+ "has been compiled for the device %s to a %zu width and therefore it can not execute correctly.",
+ c_pmeSpreadGatherMinWarpSize, kernelName,
+ deviceInfo->device_name, kernelWarpSize);
+ GMX_THROW(gmx::InternalError(errorString));
+ }
+ }
+}
+
void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
{
// We might consider storing program as a member variable if it's needed later
else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
{
splineAndSpreadKernel = kernel;
+ checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
{
spreadKernel = kernel;
+ checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
{
gatherKernel = kernel;
+ checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherReduceWithInputKernel"))
{
gatherReduceWithInputKernel = kernel;
+ checkRequiredWarpSize(gatherReduceWithInputKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
{
return Path::normalize(sourceRootPath);
}
-size_t getWarpSize(cl_context context, cl_device_id deviceId)
+/* Queries the execution width the given kernel was compiled to on deviceId.
+ * CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE is used as the proxy for the
+ * kernel's warp/sub-group size; see the header for the full contract.
+ * Throws InternalError on an OpenCL failure or a zero-sized result.
+ */
+size_t getKernelWarpSize(cl_kernel kernel, cl_device_id deviceId)
+{
+ size_t warpSize = 0;
+ cl_int cl_error = clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+ sizeof(warpSize), &warpSize, nullptr);
+ if (cl_error != CL_SUCCESS)
+ {
+ GMX_THROW(InternalError("Could not query OpenCL preferred workgroup size, error was " + ocl_get_error_string(cl_error)));
+ }
+ // A zero width can never be valid; treat it as a broken OpenCL implementation.
+ if (warpSize == 0)
+ {
+ GMX_THROW(InternalError(formatString("Invalid OpenCL warp size encountered")));
+ }
+ return warpSize;
+}
+
+size_t getDeviceWarpSize(cl_context context, cl_device_id deviceId)
{
cl_int cl_error;
const char *warpSizeKernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}";
GMX_THROW(InternalError("Could not create OpenCL kernel to determine warp size, error was " + ocl_get_error_string(cl_error)));
}
- size_t warpSize = 0;
- cl_error = clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
- sizeof(warpSize), &warpSize, nullptr);
- if (cl_error != CL_SUCCESS)
- {
- GMX_THROW(InternalError("Could not measure OpenCL warp size, error was " + ocl_get_error_string(cl_error)));
- }
- if (warpSize == 0)
- {
- GMX_THROW(InternalError(formatString("Did not measure a valid OpenCL warp size")));
- }
+ size_t warpSize = getKernelWarpSize(kernel, deviceId);
cl_error = clReleaseKernel(kernel);
if (cl_error != CL_SUCCESS)
/* Make the build options */
std::string preprocessorOptions = makePreprocessorOptions(kernelRootPath,
includeRootPath,
- getWarpSize(context, deviceId),
+ getDeviceWarpSize(context, deviceId),
deviceVendorId,
extraDefines);
namespace ocl
{
-/*! \brief Get the warp size reported by device
+/*! \brief Get the device-specific warp size
*
* This is platform implementation dependent and seems to only work on the Nvidia and AMD platforms!
* Nvidia reports 32, AMD for GPU 64. Intel seems to report 16, but that is not correct,
*
* \throws InternalError if an OpenCL error was encountered
*/
-size_t getWarpSize(cl_context context, cl_device_id deviceId);
+size_t getDeviceWarpSize(cl_context context, cl_device_id deviceId);
+
+
+/*! \brief Get the kernel-specific warp size
+ *
+ * \param kernel The OpenCL kernel object
+ * \param deviceId OpenCL device for which the kernel warp size is queried
+ * \return The execution width (warp size) the kernel was compiled to
+ *
+ * \throws InternalError if an OpenCL error was encountered
+ */
+size_t getKernelWarpSize(cl_kernel kernel, cl_device_id deviceId);
/*! \brief Compile the specified kernel for the context and device.
*