From: Szilárd Páll
Date: Fri, 15 Feb 2019 19:14:16 +0000 (+0100)
Subject: Fix OpenCL gather reduction
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=a19dd7d50c7cedabf746272c2d378dd4b9dbcefb;p=alexxy%2Fgromacs.git

Fix OpenCL gather reduction

On >=16-wide execution the reduction is correct (narrower execution
widths are checked for and excluded during kernel compilation).

TODO: Consider changing the default on NVIDIA & Intel where offloading
PME is generally not advantageous for performance.

Addresses part of #2519

Refs #2453 #2516

Change-Id: I24beaaeea096954ba32b3a80251945a9d82a3c05
---

diff --git a/docs/release-notes/2020/major/portability.rst b/docs/release-notes/2020/major/portability.rst
index c1c866ca1d..ae268a83a7 100644
--- a/docs/release-notes/2020/major/portability.rst
+++ b/docs/release-notes/2020/major/portability.rst
@@ -12,3 +12,8 @@ Added support for Hygon Dhyana CPU architecture
 Support for hardware detection and related heuristics has been implemented
 for the Hygon Dhyana derived from the first-gen AMD Zen which it shares
 most of its architectural details with.
+
+Enabled PME offload support with OpenCL on NVIDIA and Intel GPUs
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+Thanks to portability improvements, the previously disabled PME OpenCL offload
+is now enabled on NVIDIA and Intel GPUs as well.

diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst
index 6ee02260f8..d663af68a9 100644
--- a/docs/user-guide/mdrun-performance.rst
+++ b/docs/user-guide/mdrun-performance.rst
@@ -1275,14 +1275,12 @@ Limitations in the current OpenCL support of interest to |Gromacs| users:
 
 - On NVIDIA GPUs the OpenCL kernels achieve much lower performance than the
   equivalent CUDA kernels due to limitations of the NVIDIA OpenCL compiler.
-- PME is currently only supported on AMD devices, because of known
-  issues with devices from other vendors
 
 Limitations of interest to |Gromacs| developers:
 
-- The current implementation is not compatible with OpenCL devices that are
-  not using warp/wavefronts or for which the warp/wavefront size is not a
-  multiple of 32
+- The current implementation requires a minimum execution width of 16; kernels
+  compiled for a narrower execution width (be it due to hardware requirements or
+  compiler choice) will not be suitable and will trigger a runtime error.
 
 Performance checklist
 ---------------------

diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp
index 39197ecaf3..f08dcfd0e5 100644
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -166,16 +166,13 @@ bool pme_gpu_supports_build(std::string *error)
     return addMessageIfNotSupported(errorReasons, error);
 }
 
-bool pme_gpu_supports_hardware(const gmx_hw_info_t &hwinfo,
-                               std::string         *error)
+bool pme_gpu_supports_hardware(const gmx_hw_info_t gmx_unused &hwinfo,
+                               std::string                    *error)
 {
     std::list<std::string> errorReasons;
+
     if (GMX_GPU == GMX_GPU_OPENCL)
     {
-        if (!areAllGpuDevicesFromAmd(hwinfo.gpu_info))
-        {
-            errorReasons.emplace_back("non-AMD devices");
-        }
 #ifdef __APPLE__
         errorReasons.emplace_back("Apple OS X operating system");
 #endif

diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh
index ba3ed180fe..e2ad100e48 100644
--- a/src/gromacs/ewald/pme_gather.clh
+++ b/src/gromacs/ewald/pme_gather.clh
@@ -122,13 +122,10 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
     int elementIndex = smemReserved + lineIndex;
     // Store input force contributions
     sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
-
-#if !defined(_AMD_SOURCE_)
-    /* This barrier was not needed in CUDA, nor is it needed on AMD GPUs.
-     * Different OpenCL compilers might have different ideas
-     * about #pragma unroll, though. OpenCL 2 has __attribute__((opencl_unroll_hint)).
-     * #2519
-     */
+#if (warp_size < 48)
+    // sync here when exec width is smaller than the size of the sm_forceReduction
+    // buffer flushed to local mem above (size 3*16) as different warps will consume
+    // the data below.
     barrier(CLK_LOCAL_MEM_FENCE);
 #endif
 
@@ -150,9 +147,16 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
             const int packedIndex = atomIndexLocal * redStride + splineIndex;
             sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
         }
+
+        // barrier only needed for the last iteration on hardware with >=64-wide execution (e.g. AMD)
+#if (warp_size < 64)
+        barrier(CLK_LOCAL_MEM_FENCE);
+#endif
     }
 
+#if (warp_size >= 64)
     barrier(CLK_LOCAL_MEM_FENCE);
+#endif
 
     assert ((blockSize / warp_size) >= DIM);
 
@@ -162,7 +166,7 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
     // First 3 warps can now process 1 dimension each
     if (dimIndex < DIM)
     {
-        int sourceIndex = lineIndex % warp_size;
+        const int sourceIndex = lineIndex % warp_size;
 #pragma unroll
         for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
         {
@@ -172,8 +176,7 @@ inline void reduce_atom_forces(__local float * __restrict__ sm_forces,
             }
         }
 
-        const float n = read_grid_size(realGridSizeFP, dimIndex);
-
+        const float n = read_grid_size(realGridSizeFP, dimIndex);
         const int atomIndex = sourceIndex / minStride;
         if (sourceIndex == minStride * atomIndex)
         {

diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h
index 0607e02e28..ea9241b900 100644
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -211,20 +211,6 @@ void get_gpu_device_info_string(char *GPU_FUNC_ARGUMENT(s),
                                 const gmx_gpu_info_t &GPU_FUNC_ARGUMENT(gpu_info),
                                 int GPU_FUNC_ARGUMENT(index)) GPU_FUNC_TERM;
 
-/*! \brief Returns whether all compatible OpenCL devices are from AMD.
- *
- * This is currently the most useful and best tested platform for
- * supported OpenCL devices, so some modules may need to check what
- * degree of support they should offer.
- *
- * \todo An enumeration visible in the hardware module would make such
- * checks more configurable, if we discover other needs in future.
- *
- * \returns whether all detected compatible devices have AMD for the vendor.
- */
-OPENCL_FUNC_QUALIFIER
-bool areAllGpuDevicesFromAmd(const gmx_gpu_info_t &OPENCL_FUNC_ARGUMENT(gpuInfo))
-OPENCL_FUNC_TERM_WITH_RETURN(false);
 
 /*! \brief Returns the size of the gpu_dev_info struct.
  *

diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
index 8c695549f6..2dd6848916 100644
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -544,20 +544,6 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind
     }
 }
 
-bool areAllGpuDevicesFromAmd(const gmx_gpu_info_t &gpuInfo)
-{
-    bool result = true;
-    for (int i = 0; i < gpuInfo.n_dev; ++i)
-    {
-        if ((gpuInfo.gpu_dev[i].stat == egpuCompatible) &&
-            (gpuInfo.gpu_dev[i].vendor_e != OCL_VENDOR_AMD))
-        {
-            result = false;
-            break;
-        }
-    }
-    return result;
-}
 
 void init_gpu(const gmx_device_info_t *deviceInfo)
 {
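
For context on the execution-width requirement documented in the mdrun-performance.rst
hunk above, the sketch below shows one way a host-side check could look. This is not the
mechanism the patch uses (GROMACS excludes narrow execution widths at kernel compile
time via the warp_size macro); it is only an illustrative check built on the standard
OpenCL clGetKernelWorkGroupInfo query, and the helper name and the hard-coded minimum
of 16 are assumptions taken from the documentation text.

/* Illustrative sketch only, not part of the patch above: query the execution
 * width a kernel was compiled for and compare it against a required minimum.
 * CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE typically reports the SIMD
 * (warp/wavefront) width on GPU implementations.
 */
#include <stdio.h>

#include <CL/cl.h>

/* Returns 1 if the kernel reports an execution width of at least minWidth on
 * the given device, 0 otherwise (including on query failure). */
static int hasMinimumExecutionWidth(cl_kernel kernel, cl_device_id device, size_t minWidth)
{
    size_t execWidth = 0;
    cl_int status    = clGetKernelWorkGroupInfo(kernel, device,
                                                CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                                                sizeof(execWidth), &execWidth, NULL);
    if (status != CL_SUCCESS)
    {
        fprintf(stderr, "Failed to query the kernel execution width\n");
        return 0;
    }
    /* The gather reduction in pme_gather.clh assumes >=16-wide execution. */
    return (execWidth >= minWidth);
}

A caller would invoke hasMinimumExecutionWidth(kernel, device, 16) after building the
PME program and report an error (or fall back to CPU PME) when it returns 0.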