From: Szilárd Páll
Date: Fri, 15 Feb 2019 19:14:16 +0000 (+0100)
Subject: Fix OpenCL gather reduction
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=a19dd7d50c7cedabf746272c2d378dd4b9dbcefb;p=alexxy%2Fgromacs.git

Fix OpenCL gather reduction

On >=16-wide execution the reduction is correct (narrower execution
widths are checked for and excluded during kernel compilation).

TODO: Consider changing the default on NVIDIA & Intel where offloading
PME is generally not advantageous for performance.

Addresses part of #2519

Refs #2453 #2516

Change-Id: I24beaaeea096954ba32b3a80251945a9d82a3c05
---

diff --git a/docs/release-notes/2020/major/portability.rst b/docs/release-notes/2020/major/portability.rst
index c1c866ca1d..ae268a83a7 100644
--- a/docs/release-notes/2020/major/portability.rst
+++ b/docs/release-notes/2020/major/portability.rst
@@ -12,3 +12,8 @@ Added support for Hygon Dhyana CPU architecture
 Support for hardware detection and related heuristics has been implemented
 for the Hygon Dhyana derived from the first-gen AMD Zen which it shares
 most of its architectural details with.
+
+Enabled PME offload support with OpenCL on NVIDIA and Intel GPUs
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+Thanks to portability improvements, the previously disabled PME OpenCL offload
+is now enabled on NVIDIA and Intel GPUs as well.

diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst
index 6ee02260f8..d663af68a9 100644
--- a/docs/user-guide/mdrun-performance.rst
+++ b/docs/user-guide/mdrun-performance.rst
@@ -1275,14 +1275,12 @@ Limitations in the current OpenCL support of interest to |Gromacs| users:
 
 - On NVIDIA GPUs the OpenCL kernels achieve much lower performance than the
   equivalent CUDA kernels due to limitations of the NVIDIA OpenCL compiler.
-- PME is currently only supported on AMD devices, because of known
-  issues with devices from other vendors
 
 Limitations of interest to |Gromacs| developers:
 
-- The current implementation is not compatible with OpenCL devices that are
-  not using warp/wavefronts or for which the warp/wavefront size is not a
-  multiple of 32
+- The current implementation requires a minimum execution width of 16; kernels
+  compiled for a narrower execution width (be it due to hardware requirements or
+  compiler choice) will not be suitable and will trigger a runtime error.
 
 Performance checklist
 ---------------------

diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp
index 39197ecaf3..f08dcfd0e5 100644
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -166,16 +166,13 @@ bool pme_gpu_supports_build(std::string *error)
     return addMessageIfNotSupported(errorReasons, error);
 }
 
-bool pme_gpu_supports_hardware(const gmx_hw_info_t &hwinfo,
-                               std::string         *error)
+bool pme_gpu_supports_hardware(const gmx_hw_info_t gmx_unused &hwinfo,
+                               std::string                    *error)
 {
     std::list<std::string> errorReasons;
+
     if (GMX_GPU == GMX_GPU_OPENCL)
     {
-        if (!areAllGpuDevicesFromAmd(hwinfo.gpu_info))
-        {
-            errorReasons.emplace_back("non-AMD devices");
-        }
 #ifdef __APPLE__
         errorReasons.emplace_back("Apple OS X operating system");
 #endif

diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh
index ba3ed180fe..e2ad100e48 100644
--- a/src/gromacs/ewald/pme_gather.clh
+++ b/src/gromacs/ewald/pme_gather.clh
@@ -122,13 +122,10 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
     int elementIndex = smemReserved + lineIndex;
     // Store input force contributions
     sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
-
-#if !defined(_AMD_SOURCE_)
-    /* This barrier was not needed in CUDA, nor is it needed on AMD GPUs.
-     * Different OpenCL compilers might have different ideas
-     * about #pragma unroll, though. OpenCL 2 has __attribute__((opencl_unroll_hint)).
-     * #2519
-     */
+#if (warp_size < 48)
+    // sync here when exec width is smaller than the size of the sm_forceReduction
+    // buffer flushed to local mem above (size 3*16) as different warps will consume
+    // the data below.
     barrier(CLK_LOCAL_MEM_FENCE);
 #endif
 
@@ -150,9 +147,16 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
             const int packedIndex = atomIndexLocal * redStride + splineIndex;
             sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
         }
+
+        // barrier only needed for the last iteration on hardware with >=64-wide execution (e.g. AMD)
+#if (warp_size < 64)
+        barrier(CLK_LOCAL_MEM_FENCE);
+#endif
     }
 
+#if (warp_size >= 64)
     barrier(CLK_LOCAL_MEM_FENCE);
+#endif
 
     assert ((blockSize / warp_size) >= DIM);
 
@@ -162,7 +166,7 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
     // First 3 warps can now process 1 dimension each
     if (dimIndex < DIM)
     {
-        int sourceIndex = lineIndex % warp_size;
+        const int sourceIndex = lineIndex % warp_size;
 #pragma unroll
         for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
         {
@@ -172,8 +176,7 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
             }
         }
 
-        const float n = read_grid_size(realGridSizeFP, dimIndex);
-
+        const float n = read_grid_size(realGridSizeFP, dimIndex);
         const int atomIndex = sourceIndex / minStride;
         if (sourceIndex == minStride * atomIndex)
         {

diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index 0607e02e28..ea9241b900 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -211,20 +211,6 @@ void get_gpu_device_info_string(char *GPU_FUNC_ARGUMENT(s),
                                 const gmx_gpu_info_t &GPU_FUNC_ARGUMENT(gpu_info),
                                 int GPU_FUNC_ARGUMENT(index)) GPU_FUNC_TERM;
 
-/*! \brief Returns whether all compatible OpenCL devices are from AMD.
- *
- * This is currently the most useful and best tested platform for
- * supported OpenCL devices, so some modules may need to check what
- * degree of support they should offer.
- *
- * \todo An enumeration visible in the hardware module would make such
- * checks more configurable, if we discover other needs in future.
- *
- * \returns whether all detected compatible devices have AMD for the vendor.
- */
-OPENCL_FUNC_QUALIFIER
-bool areAllGpuDevicesFromAmd(const gmx_gpu_info_t &OPENCL_FUNC_ARGUMENT(gpuInfo))
-OPENCL_FUNC_TERM_WITH_RETURN(false);
 
 /*! \brief Returns the size of the gpu_dev_info struct.
  *

diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
index 8c695549f6..2dd6848916 100644
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -544,20 +544,6 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind
     }
 }
 
-bool areAllGpuDevicesFromAmd(const gmx_gpu_info_t &gpuInfo)
-{
-    bool result = true;
-    for (int i = 0; i < gpuInfo.n_dev; ++i)
-    {
-        if ((gpuInfo.gpu_dev[i].stat == egpuCompatible) &&
-            (gpuInfo.gpu_dev[i].vendor_e != OCL_VENDOR_AMD))
-        {
-            result = false;
-            break;
-        }
-    }
-    return result;
-}
 
 void init_gpu(const gmx_device_info_t *deviceInfo)
 {
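
For context on the execution-width requirement documented in the mdrun-performance.rst
hunk above, the sketch below shows one way a host-side check could look. This is not the
mechanism the patch uses (GROMACS excludes narrow execution widths at kernel compile
time via the warp_size macro); it is only an illustrative check built on the standard
OpenCL clGetKernelWorkGroupInfo query, and the helper name and the hard-coded minimum
of 16 are assumptions taken from the documentation text.

/* Illustrative sketch only, not part of the patch above: query the execution
 * width a kernel was compiled for and compare it against a required minimum.
 * CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE typically reports the SIMD
 * (warp/wavefront) width on GPU implementations.
 */
#include <stdio.h>

#include <CL/cl.h>

/* Returns 1 if the kernel reports an execution width of at least minWidth on
 * the given device, 0 otherwise (including on query failure). */
static int hasMinimumExecutionWidth(cl_kernel kernel, cl_device_id device, size_t minWidth)
{
    size_t execWidth = 0;
    cl_int status    = clGetKernelWorkGroupInfo(kernel, device,
                                                CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                                                sizeof(execWidth), &execWidth, NULL);
    if (status != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to query the kernel execution width\n");
        return 0;
    }
    /* The gather reduction in pme_gather.clh assumes >=16-wide execution. */
    return (execWidth >= minWidth);
}

A caller would invoke hasMinimumExecutionWidth(kernel, device, 16) after building the
PME program and report an error (or fall back to CPU PME) when it returns 0.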