Ensure minimum exec width of the PME OpenCL kernels

[alexxy/gromacs.git] / src / gromacs / ewald / pme-gpu-program-impl-ocl.cpp
diff --git a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp

index ab605f18d8f8c09106e690b16b9e63dbb23cc95f..f2c00f1516d4b83fc345e4067a9579d91bb3e191 100644 (file)
--- a/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
+++ b/src/gromacs/ewald/pme-gpu-program-impl-ocl.cpp
@@ -73,7 +73,9 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo)
      }
  
      // kernel parameters
-    warpSize            = gmx::ocl::getWarpSize(context, deviceId);
+    warpSize            = gmx::ocl::getDeviceWarpSize(context, deviceId);
+    // TODO: for Intel ideally we'd want to set these based on the compiler warp size
+    // but given that we've done no tuning for Intel iGPU, this is as good as anything.
      spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize,
                                     deviceInfo->maxWorkGroupSize);
      solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize,
@@ -102,6 +104,31 @@ PmeGpuProgramImpl::~PmeGpuProgramImpl()
                                                       stat, ocl_get_error_string(stat).c_str()).c_str());
  }
  
+/*! \brief Ensure that a spread/gather kernel has been compiled to a suitable warp size
+ *
+ * On Intel the exec width/warp is decided at compile-time and can be smaller than the minimum
+ * order^2 that spread/gather currently require. If \p kernel (named \p kernelName) was compiled
+ * for an Intel \p deviceInfo below c_pmeSpreadGatherMinWarpSize, gmx::InternalError is thrown.
+ */
+static void checkRequiredWarpSize(const cl_kernel          kernel,
+                                  const char*              kernelName,
+                                  const gmx_device_info_t *deviceInfo)
+{
+    if (deviceInfo->vendor_e == OCL_VENDOR_INTEL) // devices of other vendors are not checked
+    {
+        size_t kernelWarpSize = gmx::ocl::getKernelWarpSize(kernel, deviceInfo->ocl_gpu_id.ocl_device_id);
+
+        if (kernelWarpSize < c_pmeSpreadGatherMinWarpSize)
+        {
+            const std::string errorString = gmx::formatString("PME OpenCL kernels require >=%d execution width, but the %s kernel "
+                                                              "has been compiled for the device %s to a %zu width and therefore it can not execute correctly.",
+                                                              c_pmeSpreadGatherMinWarpSize, kernelName,
+                                                              deviceInfo->device_name, kernelWarpSize);
+            GMX_THROW(gmx::InternalError(errorString));
+        }
+    }
+}
+
  void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
  {
      // We might consider storing program as a member variable if it's needed later
@@ -201,18 +228,22 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
          else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
          {
              splineAndSpreadKernel = kernel;
+            checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
          }
          else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
          {
              spreadKernel = kernel;
+            checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
          }
          else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
          {
              gatherKernel = kernel;
+            checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
          }
          else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherReduceWithInputKernel"))
          {
              gatherReduceWithInputKernel = kernel;
+            checkRequiredWarpSize(gatherReduceWithInputKernel, kernelNamesBuffer.data(), deviceInfo);
          }
          else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
          {