Ensure minimum exec width of the PME OpenCL kernels
[alexxy/gromacs.git] / src / gromacs / ewald / pme-gpu-program-impl-ocl.cpp
index ab605f18d8f8c09106e690b16b9e63dbb23cc95f..f2c00f1516d4b83fc345e4067a9579d91bb3e191 100644 (file)
@@ -73,7 +73,9 @@ PmeGpuProgramImpl::PmeGpuProgramImpl(const gmx_device_info_t *deviceInfo)
     }
 
     // kernel parameters
-    warpSize            = gmx::ocl::getWarpSize(context, deviceId);
+    warpSize            = gmx::ocl::getDeviceWarpSize(context, deviceId);
+    // TODO: for Intel ideally we'd want to set these based on the compiler warp size
+    // but given that we've done no tuning for Intel iGPU, this is as good as anything.
     spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize,
                                    deviceInfo->maxWorkGroupSize);
     solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize,
@@ -102,6 +104,31 @@ PmeGpuProgramImpl::~PmeGpuProgramImpl()
                                                      stat, ocl_get_error_string(stat).c_str()).c_str());
 }
 
+/*! \brief Ensure that a spread/gather kernel has been compiled to a suitable warp size
+ *
+ * On Intel the execution width (warp size) is decided at compile time and can
+ * be smaller than the minimum of order^2 that the spread/gather kernels
+ * currently require, so the compiled kernel has to be checked.
+ *
+ * For devices of any other vendor this function does nothing.
+ *
+ * \param[in] kernel      The compiled kernel whose execution width is queried.
+ * \param[in] kernelName  Name of the kernel, used only in the error message.
+ * \param[in] deviceInfo  Device information providing the vendor, the OpenCL
+ *                        device ID and the device name.
+ *
+ * \throws gmx::InternalError if the execution width of the kernel is below
+ *         c_pmeSpreadGatherMinWarpSize.
+ */
+static void checkRequiredWarpSize(const cl_kernel          kernel,
+                                  const char*              kernelName,
+                                  const gmx_device_info_t *deviceInfo)
+{
+    if (deviceInfo->vendor_e == OCL_VENDOR_INTEL)
+    {
+        size_t kernelWarpSize = gmx::ocl::getKernelWarpSize(kernel, deviceInfo->ocl_gpu_id.ocl_device_id);
+
+        if (kernelWarpSize < c_pmeSpreadGatherMinWarpSize)
+        {
+            const std::string errorString = gmx::formatString("PME OpenCL kernels require >=%d execution width, but the %s kernel "
+                                                              "has been compiled for the device %s to a %zu width and therefore it can not execute correctly.",
+                                                              c_pmeSpreadGatherMinWarpSize, kernelName,
+                                                              deviceInfo->device_name, kernelWarpSize);
+            GMX_THROW(gmx::InternalError(errorString));
+        }
+    }
+}
+
 void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
 {
     // We might consider storing program as a member variable if it's needed later
@@ -201,18 +228,22 @@ void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
         {
             splineAndSpreadKernel = kernel;
+            checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
         {
             spreadKernel = kernel;
+            checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
         {
             gatherKernel = kernel;
+            checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherReduceWithInputKernel"))
         {
             gatherReduceWithInputKernel = kernel;
+            checkRequiredWarpSize(gatherReduceWithInputKernel, kernelNamesBuffer.data(), deviceInfo);
         }
         else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
         {