Set up build with hipSYCL

author Andrey Alekseenko <al42and@gmail.com>

Tue, 2 Mar 2021 07:30:21 +0000 (07:30 +0000)

committer Mark Abraham <mark.j.abraham@gmail.com>

Tue, 2 Mar 2021 07:30:21 +0000 (07:30 +0000)
author Andrey Alekseenko <al42and@gmail.com>
Tue, 2 Mar 2021 07:30:21 +0000 (07:30 +0000)
committer Mark Abraham <mark.j.abraham@gmail.com>
Tue, 2 Mar 2021 07:30:21 +0000 (07:30 +0000)
diff --git a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml

index 75d9cac850dd0f984719bf7cee74337d8f984019..bab6c3854487f75727895f3827ba14732eda47c0 100644 (file)
--- a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
+++ b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
@@ -271,6 +271,7 @@ include:
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0-release.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-9-release.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0-release.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-9-release.gitlab-ci.yml'
+  - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl-release.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-sycl.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl-release.gitlab-ci.yml'
    - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-sycl.gitlab-ci.yml'
diff --git a/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml

new file mode 100644 (file)

index 0000000..ba7c33c
--- /dev/null
+++ b/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml
@@ -0,0 +1,44 @@
+# Test goal: build with hipSYCL (both CUDA and ROCm backends) to check SYCL code compatibility
+# Test intents (should change rarely and conservatively):
+#   OS: Ubuntu newest supported
+#   Compiler: Clang newest supported
+#   GPU: hipSYCL
+#   Scope: configure, build
+# Test implementation choices (free to change as needed):
+#   OS: Ubuntu 20.04
+#   Build type: RelWithAssert
+#   Compiler: Clang 11
+#   MPI: thread_MPI
+#   SIMD: AVX2_256
+
+gromacs:hipsycl-dev:configure:
+  extends:
+    - .gromacs:base:configure
+    - .use-clang:base
+    - .use-sycl
+    - .rules:merge-and-post-merge-acceptance
+  image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a
+  variables:
+    CMAKE: /usr/local/cmake-3.18.4/bin/cmake
+    CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=AVX2_256"
+    CMAKE_BUILD_TYPE_OPTIONS: "-DCMAKE_BUILD_TYPE=RelWithAssert"
+    CMAKE_GPU_OPTIONS: "-DGMX_GPU=SYCL -DGMX_SYCL_HIPSYCL=ON -DHIPSYCL_TARGETS='cuda:sm_60,sm_61,sm_70,sm_75;hip:gfx900'"
+    # Unset COMPILER_LAUNCHER (previously set to ccache) because it conflicts with hipSYCL's syclcc-launcher
+    CMAKE_EXTRA_OPTIONS: "-DCMAKE_C_COMPILER_LAUNCHER= -DCMAKE_CXX_COMPILER_LAUNCHER="
+    COMPILER_MAJOR_VERSION: 11
+
+gromacs:hipsycl-dev:build:
+  extends:
+    - .variables:default
+    - .gromacs:base:build
+    - .before_script:default
+    # Not using ccache because it plays poorly with syclcc-launcher
+    - .rules:merge-and-post-merge-acceptance
+  image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a
+  variables:
+    CMAKE: /usr/local/cmake-3.18.4/bin/cmake
+  tags:
+    - k8s-scilifelab
+  needs:
+    - job: gromacs:hipsycl-dev:configure
+
diff --git a/cmake/gmxManageSYCL.cmake b/cmake/gmxManageSYCL.cmake

index af133a12fabe3ce176fc90104bf769a9f579fa7b..cf5117453655637dc57b61e6abb156950ef57b7a 100644 (file)
--- a/cmake/gmxManageSYCL.cmake
+++ b/cmake/gmxManageSYCL.cmake
@@ -55,6 +55,10 @@ if(GMX_SYCL_HIPSYCL)
          message(FATAL_ERROR "hipSYCL can only be built with Clang++ compiler")
      endif()
      set(HIPSYCL_CLANG "${CMAKE_CXX_COMPILER}")
          message(FATAL_ERROR "hipSYCL can only be built with Clang++ compiler")
      endif()
      set(HIPSYCL_CLANG "${CMAKE_CXX_COMPILER}")
+    # -Wno-unknown-cuda-version because Clang-11 complains about CUDA 11.0-11.2, despite working fine with them.
+    # -Wno-unknown-attributes because hipSYCL does not support reqd_sub_group_size (because it can only do some sub group sizes).
+    # --hipsycl-explicit-multipass is needed when building for both CUDA and HIP.
+    set(HIPSYCL_SYCLCC_EXTRA_ARGS "-Wno-unknown-cuda-version -Wno-unknown-attributes --hipsycl-explicit-multipass")
      find_package(hipsycl REQUIRED)
  else()
      if(CMAKE_CXX_COMPILER MATCHES "dpcpp")
      find_package(hipsycl REQUIRED)
  else()
      if(CMAKE_CXX_COMPILER MATCHES "dpcpp")
diff --git a/src/config.h.cmakein b/src/config.h.cmakein

index 47797c46c6080635b8f294c6f1b41f7c2f8a174b..2c1da097f048d62e0b9fde44256f6fc57d8ab556 100644 (file)
--- a/src/config.h.cmakein
+++ b/src/config.h.cmakein
@@ -227,6 +227,12 @@
  /* Define if SYCL GPU acceleration is compiled */
  #cmakedefine01 GMX_GPU_SYCL
  
  /* Define if SYCL GPU acceleration is compiled */
  #cmakedefine01 GMX_GPU_SYCL
  
+/* Define if hipSYCL is used for SYCL support (otherwise DPCPP is assumed) */
+#cmakedefine01 GMX_SYCL_HIPSYCL
+
+/* Define if Intel's DPCPP is used (default unless hipSYCL is chosen) */
+#define GMX_SYCL_DPCPP (!GMX_SYCL_HIPSYCL)
+
  /* Use a single compilation unit when compiling the CUDA (non-bonded) kernels.  */
  #cmakedefine01 GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
  
  /* Use a single compilation unit when compiling the CUDA (non-bonded) kernels.  */
  #cmakedefine01 GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
  
diff --git a/src/gromacs/gpu_utils/device_stream_sycl.cpp b/src/gromacs/gpu_utils/device_stream_sycl.cpp

index 72a431e121b920b55f4642a46c4e9bb13c6dcd2c..48c6217ab74149d48e277108ba18857759286e2a 100644 (file)
--- a/src/gromacs/gpu_utils/device_stream_sycl.cpp
+++ b/src/gromacs/gpu_utils/device_stream_sycl.cpp
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -60,7 +60,12 @@ DeviceStream::DeviceStream(const DeviceContext& deviceContext,
          const bool deviceSupportsTiming = device.get_info<cl::sycl::info::device::queue_profiling>();
          if (deviceSupportsTiming)
          {
          const bool deviceSupportsTiming = device.get_info<cl::sycl::info::device::queue_profiling>();
          if (deviceSupportsTiming)
          {
+#if (!GMX_SYCL_HIPSYCL)
+            /* Support for profiling and even the `::enable_profile` property is added in
+             * https://github.com/illuhad/hipSYCL/pull/428, which is not merged at the
+             * time of writing */
              propertyList = cl::sycl::property::queue::enable_profiling();
              propertyList = cl::sycl::property::queue::enable_profiling();
+#endif
          }
      }
      stream_ = cl::sycl::queue(deviceContext.context(), device, propertyList);
          }
      }
      stream_ = cl::sycl::queue(deviceContext.context(), device, propertyList);
diff --git a/src/gromacs/gpu_utils/gmxsycl.h b/src/gromacs/gpu_utils/gmxsycl.h

index 8eb5cabc9fda44fda285e0bbe0b9c79d9d19bdae..77891f937dc1b208e3c5af4627653eb7cddddd88 100644 (file)
--- a/src/gromacs/gpu_utils/gmxsycl.h
+++ b/src/gromacs/gpu_utils/gmxsycl.h
@@ -52,11 +52,13 @@
  #ifndef GMX_GPU_UTILS_GMXSYCL_H
  #define GMX_GPU_UTILS_GMXSYCL_H
  
  #ifndef GMX_GPU_UTILS_GMXSYCL_H
  #define GMX_GPU_UTILS_GMXSYCL_H
  
+#include "config.h"
+
  /* Some versions of Intel ICPX compiler (at least 2021.1.1 and 2021.1.2) fail to unroll a loop
   * in sycl::accessor::__init, and emit -Wpass-failed=transform-warning. This is a useful
   * warning, but mostly noise right now. Probably related to using shared memory accessors.
   * The unroll directive was introduced in https://github.com/intel/llvm/pull/2449. */
  /* Some versions of Intel ICPX compiler (at least 2021.1.1 and 2021.1.2) fail to unroll a loop
   * in sycl::accessor::__init, and emit -Wpass-failed=transform-warning. This is a useful
   * warning, but mostly noise right now. Probably related to using shared memory accessors.
   * The unroll directive was introduced in https://github.com/intel/llvm/pull/2449. */
-#if defined(__INTEL_LLVM_COMPILER)
+#if GMX_SYCL_DPCPP
  #    include <CL/sycl/version.hpp>
  #    define DISABLE_UNROLL_WARNINGS \
          ((__SYCL_COMPILER_VERSION >= 20201113) && (__SYCL_COMPILER_VERSION <= 20201214))
  #    include <CL/sycl/version.hpp>
  #    define DISABLE_UNROLL_WARNINGS \
          ((__SYCL_COMPILER_VERSION >= 20201113) && (__SYCL_COMPILER_VERSION <= 20201214))
@@ -69,6 +71,29 @@
  #    pragma clang diagnostic ignored "-Wpass-failed"
  #endif
  
  #    pragma clang diagnostic ignored "-Wpass-failed"
  #endif
  
+// For hipSYCL, we need to activate floating-point atomics
+#if GMX_SYCL_HIPSYCL
+#    define HIPSYCL_EXT_FP_ATOMICS
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wunused-variable"
+#    pragma clang diagnostic ignored "-Wunused-parameter"
+#    pragma clang diagnostic ignored "-Wmissing-noreturn"
+#    pragma clang diagnostic ignored "-Wshadow-field"
+#    pragma clang diagnostic ignored "-Wctad-maybe-unsupported"
+#    pragma clang diagnostic ignored "-Wdeprecated-copy-dtor"
+#    pragma clang diagnostic ignored "-Winconsistent-missing-destructor-override"
+#    pragma clang diagnostic ignored "-Wunused-template"
+#    pragma clang diagnostic ignored "-Wsign-compare"
+#    pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
+#    pragma clang diagnostic ignored "-Wdeprecated-copy"
+#    pragma clang diagnostic ignored "-Wnewline-eof"
+#    pragma clang diagnostic ignored "-Wextra-semi"
+#    pragma clang diagnostic ignored "-Wsuggest-override"
+#    pragma clang diagnostic ignored "-Wsuggest-destructor-override"
+#    pragma clang diagnostic ignored "-Wgcc-compat"
+#endif
+
+
  #ifdef DIM
  #    if DIM != 3
  #        error "The workaround here assumes we use DIM=3."
  #ifdef DIM
  #    if DIM != 3
  #        error "The workaround here assumes we use DIM=3."
@@ -85,6 +110,10 @@
  #    pragma clang diagnostic pop
  #endif
  
  #    pragma clang diagnostic pop
  #endif
  
+#if GMX_SYCL_HIPSYCL
+#    pragma clang diagnostic pop
+#endif
+
  #undef DISABLE_UNROLL_WARNINGS
  
  /* Exposing Intel-specific extensions in a manner compatible with SYCL2020 provisional spec.
  #undef DISABLE_UNROLL_WARNINGS
  
  /* Exposing Intel-specific extensions in a manner compatible with SYCL2020 provisional spec.
@@ -97,10 +126,10 @@ namespace sycl_2020
  {
  namespace detail
  {
  {
  namespace detail
  {
-#if defined(__SYCL_COMPILER_VERSION) // Intel DPCPP compiler
+#if GMX_SYCL_DPCPP
  // Confirmed to work for 2021.1-beta10 (20201005), 2021.1.1 (20201113), 2021.1.2 (20201214).
  namespace origin = cl::sycl::ONEAPI;
  // Confirmed to work for 2021.1-beta10 (20201005), 2021.1.1 (20201113), 2021.1.2 (20201214).
  namespace origin = cl::sycl::ONEAPI;
-#elif defined(__HIPSYCL__)
+#elif GMX_SYCL_HIPSYCL
  namespace origin = cl::sycl;
  #else
  #    error "Unsupported version of SYCL compiler"
  namespace origin = cl::sycl;
  #else
  #    error "Unsupported version of SYCL compiler"
@@ -112,7 +141,7 @@ using detail::origin::memory_scope;
  using detail::origin::plus;
  using detail::origin::sub_group;
  
  using detail::origin::plus;
  using detail::origin::sub_group;
  
-#if defined(__SYCL_COMPILER_VERSION) // Intel DPCPP compiler
+#if GMX_SYCL_DPCPP
  using detail::origin::atomic_ref;
  template<typename... Args>
  bool group_any_of(Args&&... args)
  using detail::origin::atomic_ref;
  template<typename... Args>
  bool group_any_of(Args&&... args)
@@ -124,8 +153,8 @@ auto group_reduce(Args&&... args) -> decltype(detail::origin::reduce(std::forwar
  {
      return detail::origin::reduce(std::forward<Args>(args)...);
  }
  {
      return detail::origin::reduce(std::forward<Args>(args)...);
  }
-#elif defined(__HIPSYCL__)
-// No atomic_ref in hipSYCL yet (2021-01-29)
+#elif GMX_SYCL_HIPSYCL
+// No atomic_ref in hipSYCL yet (2021-02-22)
  using detail::origin::group_any_of;
  using detail::origin::group_reduce;
  #else
  using detail::origin::group_any_of;
  using detail::origin::group_reduce;
  #else
diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h

index cdbed7cc1b972665edd49cbfeb9425ddc0f26df6..952fedf242ffb2852098e5670fbc6bcbb9f20f66 100644 (file)
--- a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h
+++ b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h
@@ -98,16 +98,22 @@ public:
       */
      inline void markEvent(const DeviceStream& deviceStream)
      {
       */
      inline void markEvent(const DeviceStream& deviceStream)
      {
+#    if GMX_SYCL_HIPSYCL
+        deviceStream.stream().wait_and_throw(); // SYCL-TODO: Use CUDA/HIP-specific solutions
+#    else
          GMX_ASSERT(!event_.has_value(), "Do not call markEvent more than once!");
          // Relies on SYCL_INTEL_enqueue_barrier
          event_ = deviceStream.stream().submit_barrier();
          GMX_ASSERT(!event_.has_value(), "Do not call markEvent more than once!");
          // Relies on SYCL_INTEL_enqueue_barrier
          event_ = deviceStream.stream().submit_barrier();
+#    endif
      }
      /*! \brief Synchronizes the host thread on the marked event.
       * As in the OpenCL implementation, the event is released.
       */
      inline void waitForEvent()
      {
      }
      /*! \brief Synchronizes the host thread on the marked event.
       * As in the OpenCL implementation, the event is released.
       */
      inline void waitForEvent()
      {
+#    if (!GMX_SYCL_HIPSYCL)
          event_->wait_and_throw();
          event_->wait_and_throw();
+#    endif
          event_.reset();
      }
      /*! \brief Checks the completion of the underlying event and resets the object if it was. */
          event_.reset();
      }
      /*! \brief Checks the completion of the underlying event and resets the object if it was. */
@@ -126,10 +132,14 @@ public:
       */
      inline void enqueueWaitEvent(const DeviceStream& deviceStream)
      {
       */
      inline void enqueueWaitEvent(const DeviceStream& deviceStream)
      {
+#    if GMX_SYCL_HIPSYCL
+        deviceStream.stream().wait_and_throw(); // SYCL-TODO: Use CUDA/HIP-specific solutions
+#    else
          // Relies on SYCL_INTEL_enqueue_barrier
          const std::vector<cl::sycl::event> waitlist{ event_.value() };
          deviceStream.stream().submit_barrier(waitlist);
          event_.reset();
          // Relies on SYCL_INTEL_enqueue_barrier
          const std::vector<cl::sycl::event> waitlist{ event_.value() };
          deviceStream.stream().submit_barrier(waitlist);
          event_.reset();
+#    endif
      }
      //! Reset the event to unmarked state.
      inline void reset() { event_.reset(); }
      }
      //! Reset the event to unmarked state.
      inline void reset() { event_.reset(); }
diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h

new file mode 100644 (file)

index 0000000..9896a51
--- /dev/null
+++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h
@@ -0,0 +1,168 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H
+#define GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H
+
+#include "gmxsycl.h"
+
+/*! \file
+ *  \brief SYCL kernel helper functions.
+ *
+ *  \author Andrey Alekseenko <al42and@gmail.com>
+ */
+
+/*! \brief Access mode to use for atomic accessors.
+ *
+ * Intel DPCPP compiler has \c sycl::atomic_ref, but has no \c sycl::atomic_fetch_add for floats.
+ * However, \c atomic_ref can not be constructed from \c sycl::atomic, so we can not use
+ * atomic accessors. Thus, we use \c mode::read_write accessors and \c atomic_ref.
+ *
+ * hipSYCL does not have \c sycl::atomic_ref, but has \c sycl::atomic_fetch_add for floats, which
+ * requires using atomic accessors. Thus, we use \c mode::atomic accessors.
+ *
+ * The \ref atomicFetchAdd function could be used for doing operations on such accessors.
+ */
+static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
+                                                   /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
+
+/*! \brief Convenience wrapper to do atomic addition to a global buffer.
+ *
+ * Skips sub-normal values.
+ *
+ * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
+ */
+template<class IndexType>
+static inline void atomicFetchAdd(DeviceAccessor<float, mode_atomic> acc, const IndexType idx, const float val)
+{
+#if GMX_SYCL_DPCPP
+    if (cl::sycl::isnormal(val))
+    {
+        sycl_2020::atomic_ref<float, sycl_2020::memory_order::relaxed, sycl_2020::memory_scope::device, cl::sycl::access::address_space::global_space>
+                fout_atomic(acc[idx]);
+        fout_atomic.fetch_add(val);
+    }
+#elif GMX_SYCL_HIPSYCL
+    if (std::isnormal(val)) // No sycl::isnormal in hipSYCL
+    {
+#    ifdef SYCL_DEVICE_ONLY
+        /* While there is support for float atomics on device, the host implementation uses
+         * Clang's __atomic_fetch_add intrinsic, that, at least in Clang 11, does not support
+         * floats. Luckily, we don't want to run on host. */
+        acc[idx].fetch_add(val);
+#    else
+        GMX_UNUSED_VALUE(acc);
+        GMX_UNUSED_VALUE(idx);
+#    endif
+    }
+#endif
+}
+
+namespace sycl_2020
+{
+#if GMX_SYCL_HIPSYCL
+__device__ static inline float shift_left(sycl_2020::sub_group, float var, sycl_2020::sub_group::linear_id_type delta)
+{
+    // No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet
+#    ifdef SYCL_DEVICE_ONLY
+#        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
+    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
+    return __shfl_down_sync(sc_cudaFullWarpMask, var, delta);
+#        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
+    // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
+    return __shfl_down(var, delta);
+#        else
+#            error "Unsupported hipSYCL target"
+#        endif
+#    else
+    // Should never be called
+    GMX_UNUSED_VALUE(var);
+    GMX_UNUSED_VALUE(delta);
+    assert(false);
+    return NAN;
+#    endif
+}
+__host__ static inline float shift_left(sycl_2020::sub_group, float, sycl_2020::sub_group::linear_id_type)
+{
+    // Should never be called
+    assert(false);
+    return NAN;
+}
+#elif GMX_SYCL_DPCPP
+static inline float shift_left(sycl_2020::sub_group sg, float var, sycl_2020::sub_group::linear_id_type delta)
+{
+    return sg.shuffle_down(var, delta);
+}
+#endif
+
+#if GMX_SYCL_HIPSYCL
+__device__ static inline float shift_right(sycl_2020::sub_group,
+                                           float                                var,
+                                           sycl_2020::sub_group::linear_id_type delta)
+{
+    // No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet
+#    ifdef SYCL_DEVICE_ONLY
+#        if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
+    static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
+    return __shfl_up_sync(sc_cudaFullWarpMask, var, delta);
+#        elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
+    // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
+    return __shfl_up(var, delta);
+#        else
+#            error "Unsupported hipSYCL target"
+#        endif
+#    else
+    // Should never be called
+    assert(false);
+    GMX_UNUSED_VALUE(var);
+    GMX_UNUSED_VALUE(delta);
+    return NAN;
+#    endif
+}
+__host__ static inline float shift_right(sycl_2020::sub_group, float, sycl_2020::sub_group::linear_id_type)
+{
+    // Should never be called
+    assert(false);
+    return NAN;
+}
+#elif GMX_SYCL_DPCPP
+static inline float shift_right(sycl_2020::sub_group sg, float var, sycl_2020::sub_group::linear_id_type delta)
+{
+    return sg.shuffle_up(var, delta);
+}
+#endif
+} // namespace sycl_2020
+
+#endif /* GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H */
diff --git a/src/gromacs/hardware/device_management_sycl.cpp b/src/gromacs/hardware/device_management_sycl.cpp

index 4fd401e04b403e4576a6240b0d09b22ffddbcb09..37d739d433db0bae598a788b579ccdf651b5dd32 100644 (file)
--- a/src/gromacs/hardware/device_management_sycl.cpp
+++ b/src/gromacs/hardware/device_management_sycl.cpp
@@ -103,8 +103,20 @@ static DeviceStatus isDeviceCompatible(const cl::sycl::device& syclDevice)
          // Host device does not support subgroups or even querying for sub_group_sizes
          return DeviceStatus::Incompatible;
      }
          // Host device does not support subgroups or even querying for sub_group_sizes
          return DeviceStatus::Incompatible;
      }
+
+#if GMX_SYCL_HIPSYCL
+    /* At the time of writing:
+     * 1. SYCL NB kernels currently don't support sub_group size of 32 or 64, which are the only
+     * ones available on NVIDIA and AMD hardware, respectively. That's not a fundamental limitation,
+     * but requires porting more OpenCL code, see #3934.
+     * 2. hipSYCL does not support cl::sycl::info::device::sub_group_sizes,
+     * see https://github.com/illuhad/hipSYCL/pull/449
+     */
+    const std::vector<size_t> supportedSubGroupSizes{ warpSize };
+#else
      const std::vector<size_t> supportedSubGroupSizes =
              syclDevice.get_info<cl::sycl::info::device::sub_group_sizes>();
      const std::vector<size_t> supportedSubGroupSizes =
              syclDevice.get_info<cl::sycl::info::device::sub_group_sizes>();
+#endif
      const size_t requiredSubGroupSizeForNBNXM = 8;
      if (std::find(supportedSubGroupSizes.begin(), supportedSubGroupSizes.end(), requiredSubGroupSizeForNBNXM)
          == supportedSubGroupSizes.end())
      const size_t requiredSubGroupSizeForNBNXM = 8;
      if (std::find(supportedSubGroupSizes.begin(), supportedSubGroupSizes.end(), requiredSubGroupSizeForNBNXM)
          == supportedSubGroupSizes.end())
diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt

index f1d575bfbe2e6fdfaad97d5215e54201e3ef4666..886679bf824830daafd402b46990189f3800310d 100644 (file)
--- a/src/gromacs/mdlib/CMakeLists.txt
+++ b/src/gromacs/mdlib/CMakeLists.txt
@@ -2,7 +2,7 @@
  # This file is part of the GROMACS molecular simulation package.
  #
  # Copyright (c) 2010,2012,2013,2014,2015 by the GROMACS development team.
  # This file is part of the GROMACS molecular simulation package.
  #
  # Copyright (c) 2010,2012,2013,2014,2015 by the GROMACS development team.
-# Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
  # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  # and including many others, as listed in the AUTHORS file in the
  # top-level source directory and at http://www.gromacs.org.
@@ -50,7 +50,7 @@ if(GMX_GPU_CUDA)
         )
  endif()
  
         )
  endif()
  
-if(GMX_GPU_SYCL)
+if(GMX_GPU_SYCL AND NOT GMX_SYCL_HIPSYCL)
      gmx_add_libgromacs_sources(
          leapfrog_gpu_sycl.cpp
      )
      gmx_add_libgromacs_sources(
          leapfrog_gpu_sycl.cpp
      )
diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.h b/src/gromacs/mdlib/tests/leapfrogtestrunners.h

index 7590aa2dfa3cbdae7733a8e9ed457401d5058e75..5c0d0cb7404575db9856cda0b29c0c864b6f38fc 100644 (file)
--- a/src/gromacs/mdlib/tests/leapfrogtestrunners.h
+++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.h
@@ -1,7 +1,7 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -54,7 +54,12 @@
  
  #include "leapfrogtestdata.h"
  
  
  #include "leapfrogtestdata.h"
  
-#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA || GMX_GPU_SYCL)
+/*
+ * LeapFrog is available with CUDA and SYCL.
+ * However, the use of float3 in CPU code makes it hard to compile with hipSYCL until
+ * Issue #3312 is resolved.
+ */
+#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA || (GMX_GPU_SYCL && !GMX_SYCL_HIPSYCL))
  
  namespace gmx
  {
  
  namespace gmx
  {
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp

index 770732c70cc9eda458b02717ce98c64135425642..a372afa77542b646d53bf6a2e413db20558a7f3b 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -301,25 +301,25 @@ static inline float interpolateCoulombForceR(const DeviceAccessor<float, mode::r
      return lerp(left, right, fraction); // TODO: cl::sycl::mix
  }
  
      return lerp(left, right, fraction); // TODO: cl::sycl::mix
  }
  
-static inline void reduceForceJShuffle(Float3                                  f,
-                                       const cl::sycl::nd_item<1>              itemIdx,
-                                       const int                               tidxi,
-                                       const int                               aidx,
-                                       DeviceAccessor<float, mode::read_write> a_f)
+static inline void reduceForceJShuffle(Float3                             f,
+                                       const cl::sycl::nd_item<1>         itemIdx,
+                                       const int                          tidxi,
+                                       const int                          aidx,
+                                       DeviceAccessor<float, mode_atomic> a_f)
  {
      static_assert(c_clSize == 8 || c_clSize == 4);
      sycl_2020::sub_group sg = itemIdx.get_sub_group();
  
  {
      static_assert(c_clSize == 8 || c_clSize == 4);
      sycl_2020::sub_group sg = itemIdx.get_sub_group();
  
-    f[0] += shuffleDown(f[0], 1, sg);
-    f[1] += shuffleUp(f[1], 1, sg);
-    f[2] += shuffleDown(f[2], 1, sg);
+    f[0] += sycl_2020::shift_left(sg, f[0], 1);
+    f[1] += sycl_2020::shift_right(sg, f[1], 1);
+    f[2] += sycl_2020::shift_left(sg, f[2], 1);
      if (tidxi & 1)
      {
          f[0] = f[1];
      }
  
      if (tidxi & 1)
      {
          f[0] = f[1];
      }
  
-    f[0] += shuffleDown(f[0], 2, sg);
-    f[2] += shuffleUp(f[2], 2, sg);
+    f[0] += sycl_2020::shift_left(sg, f[0], 2);
+    f[2] += sycl_2020::shift_right(sg, f[2], 2);
      if (tidxi & 2)
      {
          f[0] = f[2];
      if (tidxi & 2)
      {
          f[0] = f[2];
@@ -327,7 +327,7 @@ static inline void reduceForceJShuffle(Float3                                  f
  
      if constexpr (c_clSize == 8)
      {
  
      if constexpr (c_clSize == 8)
      {
-        f[0] += shuffleDown(f[0], 4, sg);
+        f[0] += sycl_2020::shift_left(sg, f[0], 4);
      }
  
      if (tidxi < 3)
      }
  
      if (tidxi < 3)
@@ -344,13 +344,13 @@ static inline void reduceForceJShuffle(Float3                                  f
  static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
                                           const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
                                           const bool   calcFShift,
  static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
                                           const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
                                           const bool   calcFShift,
-                                         const cl::sycl::nd_item<1>              itemIdx,
-                                         const int                               tidxi,
-                                         const int                               tidxj,
-                                         const int                               sci,
-                                         const int                               shift,
-                                         DeviceAccessor<float, mode::read_write> a_f,
-                                         DeviceAccessor<float, mode::read_write> a_fShift)
+                                         const cl::sycl::nd_item<1>         itemIdx,
+                                         const int                          tidxi,
+                                         const int                          tidxj,
+                                         const int                          sci,
+                                         const int                          shift,
+                                         DeviceAccessor<float, mode_atomic> a_f,
+                                         DeviceAccessor<float, mode_atomic> a_fShift)
  {
      static constexpr int bufStride  = c_clSize * c_clSize;
      static constexpr int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
  {
      static constexpr int bufStride  = c_clSize * c_clSize;
      static constexpr int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
@@ -417,13 +417,13 @@ static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read
   *
   */
  template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
   *
   */
  template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
-auto nbnxmKernel(cl::sycl::handler&                                        cgh,
-                 DeviceAccessor<Float4, mode::read>                        a_xq,
-                 DeviceAccessor<float, mode::read_write>                   a_f,
-                 DeviceAccessor<Float3, mode::read>                        a_shiftVec,
-                 DeviceAccessor<float, mode::read_write>                   a_fShift,
-                 OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
-                 OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
+auto nbnxmKernel(cl::sycl::handler&                                   cgh,
+                 DeviceAccessor<Float4, mode::read>                   a_xq,
+                 DeviceAccessor<float, mode_atomic>                   a_f,
+                 DeviceAccessor<Float3, mode::read>                   a_shiftVec,
+                 DeviceAccessor<float, mode_atomic>                   a_fShift,
+                 OptionalAccessor<float, mode_atomic, doCalcEnergies> a_energyElec,
+                 OptionalAccessor<float, mode_atomic, doCalcEnergies> a_energyVdw,
                   DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
                   DeviceAccessor<nbnxn_sci_t, mode::read>                                 a_plistSci,
                   DeviceAccessor<nbnxn_excl_t, mode::read>                    a_plistExcl,
                   DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
                   DeviceAccessor<nbnxn_sci_t, mode::read>                                 a_plistSci,
                   DeviceAccessor<nbnxn_excl_t, mode::read>                    a_plistExcl,
@@ -745,8 +745,13 @@ auto nbnxmKernel(cl::sycl::handler&                                        cgh,
  
                              // Ensure distance do not become so small that r^-12 overflows
                              r2 = std::max(r2, c_nbnxnMinDistanceSquared);
  
                              // Ensure distance do not become so small that r^-12 overflows
                              r2 = std::max(r2, c_nbnxnMinDistanceSquared);
+#if GMX_SYCL_HIPSYCL
+                            // No fast/native functions in some compilation passes
+                            const float rInv = cl::sycl::rsqrt(r2);
+#else
                              // SYCL-TODO: sycl::half_precision::rsqrt?
                              // SYCL-TODO: sycl::half_precision::rsqrt?
-                            const float rInv  = cl::sycl::native::rsqrt(r2);
+                            const float rInv = cl::sycl::native::rsqrt(r2);
+#endif
                              const float r2Inv = rInv * rInv;
                              float       r6Inv, fInvR, energyLJPair;
                              if constexpr (!props.vdwCombLB || doCalcEnergies)
                              const float r2Inv = rInv * rInv;
                              float       r6Inv, fInvR, energyLJPair;
                              if constexpr (!props.vdwCombLB || doCalcEnergies)
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp

index 62f4bb7592c835e07733cd37c4648a194cbc3a6f..d784de3a2cf3369cdeb37fd092b632f4b5a5d22e 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
@@ -91,7 +91,7 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
       * For clSize == But we need to set specific sub_group size >= 32 for clSize == 8 for correctness,
       * but it causes very poor performance.
       */
       * For clSize == But we need to set specific sub_group size >= 32 for clSize == 8 for correctness,
       * but it causes very poor performance.
       */
-    constexpr int requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize;
+    constexpr int gmx_unused requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize;
  
      /* Requirements:
       * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy
  
      /* Requirements:
       * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h

index c3aae4ebb50d930b7e14cbb506ba04ac91ba5a8c..46c9e38afe28bdb05bb5c1ad322fb16e331ec014 100644 (file)
--- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
@@ -41,6 +41,7 @@
  #ifndef GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
  #define GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
  
  #ifndef GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
  #define GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
  
+#include "gromacs/gpu_utils/sycl_kernel_utils.h"
  #include "gromacs/nbnxm/pairlist.h"
  #include "gromacs/nbnxm/pairlistparams.h"
  
  #include "gromacs/nbnxm/pairlist.h"
  #include "gromacs/nbnxm/pairlistparams.h"
  
@@ -100,28 +101,6 @@ static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d)
      return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z);
  }
  
      return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z);
  }
  
-//! \brief Convenience wrapper to do atomic addition to a global buffer
-template<cl::sycl::access::mode Mode, class IndexType>
-static inline void atomicFetchAdd(DeviceAccessor<float, Mode> acc, const IndexType idx, const float val)
-{
-    if (cl::sycl::isnormal(val))
-    {
-        sycl_2020::atomic_ref<float, sycl_2020::memory_order::relaxed, sycl_2020::memory_scope::device, cl::sycl::access::address_space::global_space>
-                fout_atomic(acc[idx]);
-        fout_atomic.fetch_add(val);
-    }
-}
-
-static inline float shuffleDown(float var, unsigned int delta, sycl_2020::sub_group sg)
-{
-    return sg.shuffle_down(var, delta);
-}
-
-static inline float shuffleUp(float var, unsigned int delta, sycl_2020::sub_group sg)
-{
-    return sg.shuffle_up(var, delta);
-}
-
  } // namespace Nbnxm
  
  #endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
  } // namespace Nbnxm
  
  #endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
author	Andrey Alekseenko <al42and@gmail.com>
	Tue, 2 Mar 2021 07:30:21 +0000 (07:30 +0000)
committer	Mark Abraham <mark.j.abraham@gmail.com>
	Tue, 2 Mar 2021 07:30:21 +0000 (07:30 +0000)
admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml		patch \| blob \| history
admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml	[new file with mode: 0644]	patch \| blob
cmake/gmxManageSYCL.cmake		patch \| blob \| history
src/config.h.cmakein		patch \| blob \| history
src/gromacs/gpu_utils/device_stream_sycl.cpp		patch \| blob \| history
src/gromacs/gpu_utils/gmxsycl.h		patch \| blob \| history
src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h		patch \| blob \| history
src/gromacs/gpu_utils/sycl_kernel_utils.h	[new file with mode: 0644]	patch \| blob
src/gromacs/hardware/device_management_sycl.cpp		patch \| blob \| history
src/gromacs/mdlib/CMakeLists.txt		patch \| blob \| history
src/gromacs/mdlib/tests/leapfrogtestrunners.h		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp		patch \| blob \| history
src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h		patch \| blob \| history