Support for hardware detection and related heuristics has been implemented
for the Hygon Dhyana, which is derived from the first-generation AMD Zen and
shares most of its architectural details with it.
+
+Enabled PME offload support with OpenCL on NVIDIA and Intel GPUs
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+Thanks to portability improvements, the previously disabled PME OpenCL offload
+is now also enabled on NVIDIA and Intel GPUs.
- On NVIDIA GPUs the OpenCL kernels achieve much lower performance
than the equivalent CUDA kernels due to limitations of the NVIDIA OpenCL
compiler.
-- PME is currently only supported on AMD devices, because of known
- issues with devices from other vendors
Limitations of interest to |Gromacs| developers:
-- The current implementation is not compatible with OpenCL devices that are
- not using warp/wavefronts or for which the warp/wavefront size is not a
- multiple of 32
+- The current implementation requires a minimum execution width of 16; kernels
+ compiled for a narrower execution width (be it due to hardware requirements or
+ compiler choice) will not be suitable and will trigger a runtime error.
Performance checklist
---------------------
return addMessageIfNotSupported(errorReasons, error);
}
-bool pme_gpu_supports_hardware(const gmx_hw_info_t &hwinfo,
- std::string *error)
+bool pme_gpu_supports_hardware(const gmx_hw_info_t gmx_unused &hwinfo,
+ std::string *error)
{
std::list<std::string> errorReasons;
+
if (GMX_GPU == GMX_GPU_OPENCL)
{
- if (!areAllGpuDevicesFromAmd(hwinfo.gpu_info))
- {
- errorReasons.emplace_back("non-AMD devices");
- }
#ifdef __APPLE__
errorReasons.emplace_back("Apple OS X operating system");
#endif
int elementIndex = smemReserved + lineIndex;
// Store input force contributions
sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
-
-#if !defined(_AMD_SOURCE_)
- /* This barrier was not needed in CUDA, nor is it needed on AMD GPUs.
- * Different OpenCL compilers might have different ideas
- * about #pragma unroll, though. OpenCL 2 has _attribute__((opencl_unroll_hint)).
- * #2519
- */
+#if (warp_size < 48)
+ // sync here when exec width is smaller than the size of the sm_forceReduction
+ // buffer flushed to local mem above (size 3*16) as different warps will consume
+ // the data below.
barrier(CLK_LOCAL_MEM_FENCE);
#endif
const int packedIndex = atomIndexLocal * redStride + splineIndex;
sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
}
+
+ // on hardware with >=64-wide execution (e.g. AMD) a barrier is only needed after the last iteration (see below); narrower execution widths sync in every iteration
+#if (warp_size < 64)
+ barrier(CLK_LOCAL_MEM_FENCE);
+#endif
}
+#if (warp_size >= 64)
barrier(CLK_LOCAL_MEM_FENCE);
+#endif
assert ((blockSize / warp_size) >= DIM);
// First 3 warps can now process 1 dimension each
if (dimIndex < DIM)
{
- int sourceIndex = lineIndex % warp_size;
+ const int sourceIndex = lineIndex % warp_size;
#pragma unroll
for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
{
}
}
- const float n = read_grid_size(realGridSizeFP, dimIndex);
-
+ const float n = read_grid_size(realGridSizeFP, dimIndex);
const int atomIndex = sourceIndex / minStride;
if (sourceIndex == minStride * atomIndex)
{
const gmx_gpu_info_t &GPU_FUNC_ARGUMENT(gpu_info),
int GPU_FUNC_ARGUMENT(index)) GPU_FUNC_TERM;
-/*! \brief Returns whether all compatible OpenCL devices are from AMD.
- *
- * This is currently the most useful and best tested platform for
- * supported OpenCL devices, so some modules may need to check what
- * degree of support they should offer.
- *
- * \todo An enumeration visible in the hardware module would make such
- * checks more configurable, if we discover other needs in future.
- *
- * \returns whether all detected compatible devices have AMD for the vendor.
- */
-OPENCL_FUNC_QUALIFIER
-bool areAllGpuDevicesFromAmd(const gmx_gpu_info_t &OPENCL_FUNC_ARGUMENT(gpuInfo))
-OPENCL_FUNC_TERM_WITH_RETURN(false);
/*! \brief Returns the size of the gpu_dev_info struct.
*
}
}
-bool areAllGpuDevicesFromAmd(const gmx_gpu_info_t &gpuInfo)
-{
- bool result = true;
- for (int i = 0; i < gpuInfo.n_dev; ++i)
- {
- if ((gpuInfo.gpu_dev[i].stat == egpuCompatible) &&
- (gpuInfo.gpu_dev[i].vendor_e != OCL_VENDOR_AMD))
- {
- result = false;
- break;
- }
- }
- return result;
-}
void init_gpu(const gmx_device_info_t *deviceInfo)
{