*/
#define PME_SPREADGATHER_THREADS_PER_ATOM (order * order)
+/*! \brief Minimum execution width of the PME spread and gather kernels.
+ *
+ * Due to the one thread per atom and order=4 implementation constraints, order^2 threads
+ * should execute without synchronization needed. See PME_SPREADGATHER_THREADS_PER_ATOM
+ */
+constexpr int c_pmeSpreadGatherMinWarpSize = 16;
+
/*! \brief
* Atom data alignment (in terms of number of atoms).
* This is the least common multiple of number of atoms processed by
}
// kernel parameters
- warpSize = gmx::ocl::getWarpSize(context, deviceId);
+ warpSize = gmx::ocl::getDeviceWarpSize(context, deviceId);
+ // TODO: for Intel ideally we'd want to set these based on the compiler warp size
+ // but given that we've done no tuning for Intel iGPU, this is as good as anything.
spreadWorkGroupSize = std::min(c_spreadMaxWarpsPerBlock * warpSize,
deviceInfo->maxWorkGroupSize);
solveMaxWorkGroupSize = std::min(c_solveMaxWarpsPerBlock * warpSize,
stat, ocl_get_error_string(stat).c_str()).c_str());
}
+/*! \brief Ensure that spread/gather kernels have been compiled to a suitable warp size
+ *
+ * On Intel the exec width/warp is decided at compile-time and can be
+ * smaller than the minimum order^2 required in spread/gather ATM which
+ * we need to check for.
+ *
+ * \param[in] kernel      Compiled OpenCL kernel object to check.
+ * \param[in] kernelName  Kernel name, used only to format the error message.
+ * \param[in] deviceInfo  Device information (vendor, device id, device name).
+ *
+ * \throws InternalError if the kernel was compiled to an execution width
+ *         narrower than c_pmeSpreadGatherMinWarpSize.
+ */
+static void checkRequiredWarpSize(const cl_kernel kernel,
+ const char* kernelName,
+ const gmx_device_info_t *deviceInfo)
+{
+ // Only Intel needs the check: per the note above, its kernel exec width is
+ // chosen at compile time and may fall below the order^2 minimum.
+ if (deviceInfo->vendor_e == OCL_VENDOR_INTEL)
+ {
+ // Width this particular kernel binary was actually compiled to
+ size_t kernelWarpSize = gmx::ocl::getKernelWarpSize(kernel, deviceInfo->ocl_gpu_id.ocl_device_id);
+
+ if (kernelWarpSize < c_pmeSpreadGatherMinWarpSize)
+ {
+ const std::string errorString = gmx::formatString("PME OpenCL kernels require >=%d execution width, but the %s kernel "
+ "has been compiled for the device %s to a %zu width and therefore it can not execute correctly.",
+ c_pmeSpreadGatherMinWarpSize, kernelName,
+ deviceInfo->device_name, kernelWarpSize);
+ GMX_THROW(gmx::InternalError(errorString));
+ }
+ }
+}
+
void PmeGpuProgramImpl::compileKernels(const gmx_device_info_t *deviceInfo)
{
// We might consider storing program as a member variable if it's needed later
else if (!strcmp(kernelNamesBuffer.data(), "pmeSplineAndSpreadKernel"))
{
splineAndSpreadKernel = kernel;
+ checkRequiredWarpSize(splineAndSpreadKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeSpreadKernel"))
{
spreadKernel = kernel;
+ checkRequiredWarpSize(spreadKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherKernel"))
{
gatherKernel = kernel;
+ checkRequiredWarpSize(gatherKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeGatherReduceWithInputKernel"))
{
gatherReduceWithInputKernel = kernel;
+ checkRequiredWarpSize(gatherReduceWithInputKernel, kernelNamesBuffer.data(), deviceInfo);
}
else if (!strcmp(kernelNamesBuffer.data(), "pmeSolveYZXKernel"))
{
return Path::normalize(sourceRootPath);
}
-size_t getWarpSize(cl_context context, cl_device_id deviceId)
+/* Queries the execution width the given kernel was compiled to on deviceId.
+ * CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE is used as the proxy for the
+ * kernel's warp/sub-group size; see the header for the full contract.
+ * Throws InternalError on an OpenCL failure or a zero-sized result.
+ */
+size_t getKernelWarpSize(cl_kernel kernel, cl_device_id deviceId)
+{
+ size_t warpSize = 0;
+ cl_int cl_error = clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
+ sizeof(warpSize), &warpSize, nullptr);
+ if (cl_error != CL_SUCCESS)
+ {
+ GMX_THROW(InternalError("Could not query OpenCL preferred workgroup size, error was " + ocl_get_error_string(cl_error)));
+ }
+ // A zero width can never be valid; treat it as a broken OpenCL implementation.
+ if (warpSize == 0)
+ {
+ GMX_THROW(InternalError(formatString("Invalid OpenCL warp size encountered")));
+ }
+ return warpSize;
+}
+
+size_t getDeviceWarpSize(cl_context context, cl_device_id deviceId)
{
cl_int cl_error;
const char *warpSizeKernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}";
GMX_THROW(InternalError("Could not create OpenCL kernel to determine warp size, error was " + ocl_get_error_string(cl_error)));
}
- size_t warpSize = 0;
- cl_error = clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
- sizeof(warpSize), &warpSize, nullptr);
- if (cl_error != CL_SUCCESS)
- {
- GMX_THROW(InternalError("Could not measure OpenCL warp size, error was " + ocl_get_error_string(cl_error)));
- }
- if (warpSize == 0)
- {
- GMX_THROW(InternalError(formatString("Did not measure a valid OpenCL warp size")));
- }
+ size_t warpSize = getKernelWarpSize(kernel, deviceId);
cl_error = clReleaseKernel(kernel);
if (cl_error != CL_SUCCESS)
/* Make the build options */
std::string preprocessorOptions = makePreprocessorOptions(kernelRootPath,
includeRootPath,
- getWarpSize(context, deviceId),
+ getDeviceWarpSize(context, deviceId),
deviceVendorId,
extraDefines);
namespace ocl
{
-/*! \brief Get the warp size reported by device
+/*! \brief Get the device-specific warp size
*
* This is platform implementation dependent and seems to only work on the Nvidia and AMD platforms!
* Nvidia reports 32, AMD for GPU 64. Intel seems to report 16, but that is not correct,
*
* \throws InternalError if an OpenCL error was encountered
*/
-size_t getWarpSize(cl_context context, cl_device_id deviceId);
+size_t getDeviceWarpSize(cl_context context, cl_device_id deviceId);
+
+
+/*! \brief Get the kernel-specific warp size
+ *
+ * \param kernel The OpenCL kernel object
+ * \param deviceId OpenCL device for which the kernel warp size is queried
+ * \return The execution width (warp size) the kernel was compiled to
+ *
+ * \throws InternalError if an OpenCL error was encountered
+ */
+size_t getKernelWarpSize(cl_kernel kernel, cl_device_id deviceId);
/*! \brief Compile the specified kernel for the context and device.
*