Fix OpenCL gather reduction

author Szilárd Páll <pall.szilard@gmail.com>

Fri, 15 Feb 2019 19:14:16 +0000 (20:14 +0100)

committer Mark Abraham <mark.j.abraham@gmail.com>

Thu, 15 Aug 2019 19:46:18 +0000 (21:46 +0200)
author Szilárd Páll <pall.szilard@gmail.com>
Fri, 15 Feb 2019 19:14:16 +0000 (20:14 +0100)
committer Mark Abraham <mark.j.abraham@gmail.com>
Thu, 15 Aug 2019 19:46:18 +0000 (21:46 +0200)
diff --git a/docs/release-notes/2020/major/portability.rst b/docs/release-notes/2020/major/portability.rst

index c1c866ca1d59106152a796c12bf791344c30b0c9..ae268a83a704f54edbe1b781289a6ecb08581589 100644 (file)
--- a/docs/release-notes/2020/major/portability.rst
+++ b/docs/release-notes/2020/major/portability.rst
@@ -12,3 +12,8 @@ Added support for Hygon Dhyana CPU architecture
  Support for hardware detection and related heuristics has been implemented
  for the Hygon Dhyana derived from the first-gen AMD Zen which it shares most
  of its architectural details with.
+
+Enabled PME offload support with OpenCL on NVIDIA and Intel GPUs
+""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+Thanks to portability improvements, the previously disabled PME OpenCL offload
+is now enabled also on NVIDIA and Intel GPUs.
diff --git a/docs/user-guide/mdrun-performance.rst b/docs/user-guide/mdrun-performance.rst

index 6ee02260f8e693bc94881d7f85b7c01cfced3be8..d663af68a9af6c609f1d2d60464118ab0dcddcea 100644 (file)
--- a/docs/user-guide/mdrun-performance.rst
+++ b/docs/user-guide/mdrun-performance.rst
@@ -1275,14 +1275,12 @@ Limitations in the current OpenCL support of interest to |Gromacs| users:
  - On NVIDIA GPUs the OpenCL kernels achieve much lower performance
    than the equivalent CUDA kernels due to limitations of the NVIDIA OpenCL
    compiler.
-- PME is currently only supported on AMD devices, because of known
-  issues with devices from other vendors
  
  Limitations of interest to |Gromacs| developers:
  
-- The current implementation is not compatible with OpenCL devices that are
-  not using warp/wavefronts or for which the warp/wavefront size is not a
-  multiple of 32
+- The current implementation requires a minimum execution with of 16; kernels
+  compiled for narrower execution width (be it due to hardware requirements or
+  compiler choice) will not be suitable and will trigger a runtime error.
  
  Performance checklist
  ---------------------
diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp

index 39197ecaf3d8078b3e830d850f44b125c687a91c..f08dcfd0e55443aafacabcfc3210c98feadaa058 100644 (file)
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -166,16 +166,13 @@ bool pme_gpu_supports_build(std::string *error)
      return addMessageIfNotSupported(errorReasons, error);
  }
  
-bool pme_gpu_supports_hardware(const gmx_hw_info_t &hwinfo,
-                               std::string         *error)
+bool pme_gpu_supports_hardware(const gmx_hw_info_t gmx_unused &hwinfo,
+                               std::string                    *error)
  {
      std::list<std::string> errorReasons;
+
      if (GMX_GPU == GMX_GPU_OPENCL)
      {
-        if (!areAllGpuDevicesFromAmd(hwinfo.gpu_info))
-        {
-            errorReasons.emplace_back("non-AMD devices");
-        }
  #ifdef __APPLE__
          errorReasons.emplace_back("Apple OS X operating system");
  #endif
diff --git a/src/gromacs/ewald/pme_gather.clh b/src/gromacs/ewald/pme_gather.clh

index ba3ed180fe0951e80ff0fe065cb4a029f5591d00..e2ad100e48d07a7b774671fd0a311d37d9d96b32 100644 (file)
--- a/src/gromacs/ewald/pme_gather.clh
+++ b/src/gromacs/ewald/pme_gather.clh
@@ -122,13 +122,10 @@ inline void reduce_atom_forces(__local float * __restrict__  sm_forces,
          int elementIndex = smemReserved + lineIndex;
          // Store input force contributions
          sm_forceReduction[elementIndex] = (dimIndex == XX) ? fx : (dimIndex == YY) ? fy : fz;
-
-#if !defined(_AMD_SOURCE_)
-        /* This barrier was not needed in CUDA, nor is it needed on AMD GPUs.
-         * Different OpenCL compilers might have different ideas
-         * about #pragma unroll, though. OpenCL 2 has _attribute__((opencl_unroll_hint)).
-         * #2519
-         */
+#if (warp_size < 48)
+        // sync here when exec width is smaller than the size of the sm_forceReduction
+        // buffer flushed to local mem above (size 3*16) as different warps will consume
+        // the data below.
          barrier(CLK_LOCAL_MEM_FENCE);
  #endif
  
@@ -150,9 +147,16 @@ inline void reduce_atom_forces(__local float * __restrict__  sm_forces,
              const int packedIndex = atomIndexLocal * redStride + splineIndex;
              sm_forceTemp[dimIndex][packedIndex] = sm_forceReduction[elementIndex] + sm_forceReduction[elementIndex + redStride];
          }
+
+        // barrier only needed for the last iteration on hardware with >=64-wide execution (e.g. AMD)
+#if (warp_size < 64)
+        barrier(CLK_LOCAL_MEM_FENCE);
+#endif
      }
  
+#if (warp_size >= 64)
      barrier(CLK_LOCAL_MEM_FENCE);
+#endif
  
      assert ((blockSize / warp_size) >= DIM);
  
@@ -162,7 +166,7 @@ inline void reduce_atom_forces(__local float * __restrict__  sm_forces,
      // First 3 warps can now process 1 dimension each
      if (dimIndex < DIM)
      {
-        int sourceIndex = lineIndex % warp_size;
+        const int sourceIndex = lineIndex % warp_size;
  #pragma unroll
          for (int redStride = minStride >> 1; redStride > 1; redStride >>= 1)
          {
@@ -172,8 +176,7 @@ inline void reduce_atom_forces(__local float * __restrict__  sm_forces,
              }
          }
  
-        const float n = read_grid_size(realGridSizeFP, dimIndex);
-
+        const float n         = read_grid_size(realGridSizeFP, dimIndex);
          const int   atomIndex = sourceIndex / minStride;
          if (sourceIndex == minStride * atomIndex)
          {
diff --git a/src/gromacs/gpu_utils/gpu_utils.h b/src/gromacs/gpu_utils/gpu_utils.h

index 0607e02e2893bc7b6949605541467f30060206a5..ea9241b9004aa13b3d7b1dae1dc3a5a86637d12d 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils.h
+++ b/src/gromacs/gpu_utils/gpu_utils.h
@@ -211,20 +211,6 @@ void get_gpu_device_info_string(char *GPU_FUNC_ARGUMENT(s),
                                  const gmx_gpu_info_t &GPU_FUNC_ARGUMENT(gpu_info),
                                  int GPU_FUNC_ARGUMENT(index)) GPU_FUNC_TERM;
  
-/*! \brief Returns whether all compatible OpenCL devices are from AMD.
- *
- * This is currently the most useful and best tested platform for
- * supported OpenCL devices, so some modules may need to check what
- * degree of support they should offer.
- *
- * \todo An enumeration visible in the hardware module would make such
- * checks more configurable, if we discover other needs in future.
- *
- * \returns whether all detected compatible devices have AMD for the vendor.
- */
-OPENCL_FUNC_QUALIFIER
-bool areAllGpuDevicesFromAmd(const gmx_gpu_info_t &OPENCL_FUNC_ARGUMENT(gpuInfo))
-OPENCL_FUNC_TERM_WITH_RETURN(false);
  
  /*! \brief Returns the size of the gpu_dev_info struct.
   *
diff --git a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp

index 8c695549f636f585416b68ec43a647758d57b78a..2dd684891637e7da81c4f660b07a170148fff0e7 100644 (file)
--- a/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils_ocl.cpp
@@ -544,20 +544,6 @@ void get_gpu_device_info_string(char *s, const gmx_gpu_info_t &gpu_info, int ind
      }
  }
  
-bool areAllGpuDevicesFromAmd(const gmx_gpu_info_t &gpuInfo)
-{
-    bool result = true;
-    for (int i = 0; i < gpuInfo.n_dev; ++i)
-    {
-        if ((gpuInfo.gpu_dev[i].stat == egpuCompatible) &&
-            (gpuInfo.gpu_dev[i].vendor_e != OCL_VENDOR_AMD))
-        {
-            result = false;
-            break;
-        }
-    }
-    return result;
-}
  
  void init_gpu(const gmx_device_info_t *deviceInfo)
  {
author	Szilárd Páll <pall.szilard@gmail.com>
	Fri, 15 Feb 2019 19:14:16 +0000 (20:14 +0100)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Thu, 15 Aug 2019 19:46:18 +0000 (21:46 +0200)
docs/release-notes/2020/major/portability.rst		patch \| blob \| history
docs/user-guide/mdrun-performance.rst		patch \| blob \| history
src/gromacs/ewald/pme.cpp		patch \| blob \| history
src/gromacs/ewald/pme_gather.clh		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils.h		patch \| blob \| history
src/gromacs/gpu_utils/gpu_utils_ocl.cpp		patch \| blob \| history