From 1a21c4596d1548734f3991f812f6ea742796c51a Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko
Date: Tue, 2 Mar 2021 07:30:21 +0000
Subject: [PATCH] Set up build with hipSYCL

The kernel code is currently non-functional on NVIDIA/AMD hardware, but
the build itself is useful to ensure code portability.

- Disabled LeapFrog for hipSYCL. With `float3` being used in the host
  code, it is easiest to just disable the integrator until #3312 is
  resolved. This will have to be resolved in order to run the full MD
  loop on a GPU with hipSYCL. See #3941.
- Some warnings from the compiler and included headers are silenced.
- Workarounds for features not supported by hipSYCL are introduced:
- - Shuffles. Should work, but not tested yet. Hopefully, they will be
    introduced in mainline hipSYCL soon.
- - Barrier synchronization. An Intel-specific extension to SYCL that can
    be imitated fairly well with native OpenCL/CUDA calls, but is only
    crudely stubbed for now. A proper solution will be implemented after
    #2527 and #3924 are resolved.
- CI jobs are added. The set of target device architectures is chosen
  semi-arbitrarily, since the code is not being run yet anyway. The main
  concern was to have both HIP and CUDA covered.

Refs #3923.
---
 admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml  | 1 +
 .../gromacs.hipsycl-dev.gitlab-ci.yml         | 44 +++++
 cmake/gmxManageSYCL.cmake                     | 4 +
 src/config.h.cmakein                          | 6 +
 src/gromacs/gpu_utils/device_stream_sycl.cpp  | 7 +-
 src/gromacs/gpu_utils/gmxsycl.h               | 41 ++++-
 .../gpu_utils/gpueventsynchronizer_sycl.h     | 10 ++
 src/gromacs/gpu_utils/sycl_kernel_utils.h     | 168 ++++++++++++++++++
 .../hardware/device_management_sycl.cpp       | 12 ++
 src/gromacs/mdlib/CMakeLists.txt              | 4 +-
 src/gromacs/mdlib/tests/leapfrogtestrunners.h | 9 +-
 src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp  | 57 +++---
 .../sycl/nbnxm_sycl_kernel_pruneonly.cpp      | 2 +-
 .../nbnxm/sycl/nbnxm_sycl_kernel_utils.h      | 23 +--
 14 files changed, 328 insertions(+), 60 deletions(-)
 create mode 100644 admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml
 create mode 100644 src/gromacs/gpu_utils/sycl_kernel_utils.h

diff --git a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
index 75d9cac850..bab6c38544 100644
--- a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
+++ b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
@@ -271,6 +271,7 @@ include:
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0-release.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-9-release.gitlab-ci.yml'
+  - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl-release.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-sycl.gitlab-ci.yml'
diff --git a/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml
new file mode 100644
index 0000000000..ba7c33ca99
--- /dev/null
+++ b/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml
@@ -0,0 +1,44 @@
+# Test goal: build with hipSYCL (both CUDA and ROCm backends) to check SYCL code compatibility
+# Test intents (should change rarely and conservatively):
+# OS: Ubuntu newest supported
+# Compiler: Clang newest supported
+# GPU: hipSYCL
+# Scope: configure, build
+# Test implementation choices (free to change as 
needed): +# OS: Ubuntu 20.04 +# Build type: RelWithAssert +# Compiler: Clang 11 +# MPI: thread_MPI +# SIMD: AVX2_256 + +gromacs:hipsycl-dev:configure: + extends: + - .gromacs:base:configure + - .use-clang:base + - .use-sycl + - .rules:merge-and-post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a + variables: + CMAKE: /usr/local/cmake-3.18.4/bin/cmake + CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=AVX2_256" + CMAKE_BUILD_TYPE_OPTIONS: "-DCMAKE_BUILD_TYPE=RelWithAssert" + CMAKE_GPU_OPTIONS: "-DGMX_GPU=SYCL -DGMX_SYCL_HIPSYCL=ON -DHIPSYCL_TARGETS='cuda:sm_60,sm_61,sm_70,sm_75;hip:gfx900'" + # Unset COMPILER_LAUNCHER (previously set to ccache) because it conflicts with hipSYCL's syclcc-launcher + CMAKE_EXTRA_OPTIONS: "-DCMAKE_C_COMPILER_LAUNCHER= -DCMAKE_CXX_COMPILER_LAUNCHER=" + COMPILER_MAJOR_VERSION: 11 + +gromacs:hipsycl-dev:build: + extends: + - .variables:default + - .gromacs:base:build + - .before_script:default + # Not using ccache because it plays poorly with syclcc-launcher + - .rules:merge-and-post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a + variables: + CMAKE: /usr/local/cmake-3.18.4/bin/cmake + tags: + - k8s-scilifelab + needs: + - job: gromacs:hipsycl-dev:configure + diff --git a/cmake/gmxManageSYCL.cmake b/cmake/gmxManageSYCL.cmake index af133a12fa..cf51174536 100644 --- a/cmake/gmxManageSYCL.cmake +++ b/cmake/gmxManageSYCL.cmake @@ -55,6 +55,10 @@ if(GMX_SYCL_HIPSYCL) message(FATAL_ERROR "hipSYCL can only be built with Clang++ compiler") endif() set(HIPSYCL_CLANG "${CMAKE_CXX_COMPILER}") + # -Wno-unknown-cuda-version because Clang-11 complains about CUDA 11.0-11.2, despite working fine with them. + # -Wno-unknown-attributes because hipSYCL does not support reqd_sub_group_size (because it can only do some sub group sizes). + # --hipsycl-explicit-multipass is needed when building for both CUDA and HIP. + set(HIPSYCL_SYCLCC_EXTRA_ARGS "-Wno-unknown-cuda-version -Wno-unknown-attributes --hipsycl-explicit-multipass") find_package(hipsycl REQUIRED) else() if(CMAKE_CXX_COMPILER MATCHES "dpcpp") diff --git a/src/config.h.cmakein b/src/config.h.cmakein index 47797c46c6..2c1da097f0 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -227,6 +227,12 @@ /* Define if SYCL GPU acceleration is compiled */ #cmakedefine01 GMX_GPU_SYCL +/* Define if hipSYCL is used for SYCL support (otherwise DPCPP is assumed) */ +#cmakedefine01 GMX_SYCL_HIPSYCL + +/* Define if Intel's DPCPP is used (default unless hipSYCL is chosen) */ +#define GMX_SYCL_DPCPP (!GMX_SYCL_HIPSYCL) + /* Use a single compilation unit when compiling the CUDA (non-bonded) kernels. */ #cmakedefine01 GMX_CUDA_NB_SINGLE_COMPILATION_UNIT diff --git a/src/gromacs/gpu_utils/device_stream_sycl.cpp b/src/gromacs/gpu_utils/device_stream_sycl.cpp index 72a431e121..48c6217ab7 100644 --- a/src/gromacs/gpu_utils/device_stream_sycl.cpp +++ b/src/gromacs/gpu_utils/device_stream_sycl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -60,7 +60,12 @@ DeviceStream::DeviceStream(const DeviceContext& deviceContext, const bool deviceSupportsTiming = device.get_info(); if (deviceSupportsTiming) { +#if (!GMX_SYCL_HIPSYCL) + /* Support for profiling and even the `::enable_profile` property is added in + * https://github.com/illuhad/hipSYCL/pull/428, which is not merged at the + * time of writing */ propertyList = cl::sycl::property::queue::enable_profiling(); +#endif } } stream_ = cl::sycl::queue(deviceContext.context(), device, propertyList); diff --git a/src/gromacs/gpu_utils/gmxsycl.h b/src/gromacs/gpu_utils/gmxsycl.h index 8eb5cabc9f..77891f937d 100644 --- a/src/gromacs/gpu_utils/gmxsycl.h +++ b/src/gromacs/gpu_utils/gmxsycl.h @@ -52,11 +52,13 @@ #ifndef GMX_GPU_UTILS_GMXSYCL_H #define GMX_GPU_UTILS_GMXSYCL_H +#include "config.h" + /* Some versions of Intel ICPX compiler (at least 2021.1.1 and 2021.1.2) fail to unroll a loop * in sycl::accessor::__init, and emit -Wpass-failed=transform-warning. This is a useful * warning, but mostly noise right now. Probably related to using shared memory accessors. * The unroll directive was introduced in https://github.com/intel/llvm/pull/2449. */ -#if defined(__INTEL_LLVM_COMPILER) +#if GMX_SYCL_DPCPP # include # define DISABLE_UNROLL_WARNINGS \ ((__SYCL_COMPILER_VERSION >= 20201113) && (__SYCL_COMPILER_VERSION <= 20201214)) @@ -69,6 +71,29 @@ # pragma clang diagnostic ignored "-Wpass-failed" #endif +// For hipSYCL, we need to activate floating-point atomics +#if GMX_SYCL_HIPSYCL +# define HIPSYCL_EXT_FP_ATOMICS +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-variable" +# pragma clang diagnostic ignored "-Wunused-parameter" +# pragma clang diagnostic ignored "-Wmissing-noreturn" +# pragma clang diagnostic ignored "-Wshadow-field" +# pragma clang diagnostic ignored "-Wctad-maybe-unsupported" +# pragma clang diagnostic ignored "-Wdeprecated-copy-dtor" +# pragma clang diagnostic ignored "-Winconsistent-missing-destructor-override" +# pragma clang diagnostic ignored "-Wunused-template" +# pragma clang diagnostic ignored "-Wsign-compare" +# pragma clang diagnostic ignored "-Wundefined-reinterpret-cast" +# pragma clang diagnostic ignored "-Wdeprecated-copy" +# pragma clang diagnostic ignored "-Wnewline-eof" +# pragma clang diagnostic ignored "-Wextra-semi" +# pragma clang diagnostic ignored "-Wsuggest-override" +# pragma clang diagnostic ignored "-Wsuggest-destructor-override" +# pragma clang diagnostic ignored "-Wgcc-compat" +#endif + + #ifdef DIM # if DIM != 3 # error "The workaround here assumes we use DIM=3." @@ -85,6 +110,10 @@ # pragma clang diagnostic pop #endif +#if GMX_SYCL_HIPSYCL +# pragma clang diagnostic pop +#endif + #undef DISABLE_UNROLL_WARNINGS /* Exposing Intel-specific extensions in a manner compatible with SYCL2020 provisional spec. @@ -97,10 +126,10 @@ namespace sycl_2020 { namespace detail { -#if defined(__SYCL_COMPILER_VERSION) // Intel DPCPP compiler +#if GMX_SYCL_DPCPP // Confirmed to work for 2021.1-beta10 (20201005), 2021.1.1 (20201113), 2021.1.2 (20201214). namespace origin = cl::sycl::ONEAPI; -#elif defined(__HIPSYCL__) +#elif GMX_SYCL_HIPSYCL namespace origin = cl::sycl; #else # error "Unsupported version of SYCL compiler" @@ -112,7 +141,7 @@ using detail::origin::memory_scope; using detail::origin::plus; using detail::origin::sub_group; -#if defined(__SYCL_COMPILER_VERSION) // Intel DPCPP compiler +#if GMX_SYCL_DPCPP using detail::origin::atomic_ref; template bool group_any_of(Args&&... 
args) @@ -124,8 +153,8 @@ auto group_reduce(Args&&... args) -> decltype(detail::origin::reduce(std::forwar { return detail::origin::reduce(std::forward(args)...); } -#elif defined(__HIPSYCL__) -// No atomic_ref in hipSYCL yet (2021-01-29) +#elif GMX_SYCL_HIPSYCL +// No atomic_ref in hipSYCL yet (2021-02-22) using detail::origin::group_any_of; using detail::origin::group_reduce; #else diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h index cdbed7cc1b..952fedf242 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h +++ b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h @@ -98,16 +98,22 @@ public: */ inline void markEvent(const DeviceStream& deviceStream) { +# if GMX_SYCL_HIPSYCL + deviceStream.stream().wait_and_throw(); // SYCL-TODO: Use CUDA/HIP-specific solutions +# else GMX_ASSERT(!event_.has_value(), "Do not call markEvent more than once!"); // Relies on SYCL_INTEL_enqueue_barrier event_ = deviceStream.stream().submit_barrier(); +# endif } /*! \brief Synchronizes the host thread on the marked event. * As in the OpenCL implementation, the event is released. */ inline void waitForEvent() { +# if (!GMX_SYCL_HIPSYCL) event_->wait_and_throw(); +# endif event_.reset(); } /*! \brief Checks the completion of the underlying event and resets the object if it was. */ @@ -126,10 +132,14 @@ public: */ inline void enqueueWaitEvent(const DeviceStream& deviceStream) { +# if GMX_SYCL_HIPSYCL + deviceStream.stream().wait_and_throw(); // SYCL-TODO: Use CUDA/HIP-specific solutions +# else // Relies on SYCL_INTEL_enqueue_barrier const std::vector waitlist{ event_.value() }; deviceStream.stream().submit_barrier(waitlist); event_.reset(); +# endif } //! Reset the event to unmarked state. inline void reset() { event_.reset(); } diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h new file mode 100644 index 0000000000..9896a51d3e --- /dev/null +++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h @@ -0,0 +1,168 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. 
Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H +#define GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H + +#include "gmxsycl.h" + +/*! \file + * \brief SYCL kernel helper functions. + * + * \author Andrey Alekseenko + */ + +/*! \brief Access mode to use for atomic accessors. + * + * Intel DPCPP compiler has \c sycl::atomic_ref, but has no \c sycl::atomic_fetch_add for floats. + * However, \c atomic_ref can not be constructed from \c sycl::atomic, so we can not use + * atomic accessors. Thus, we use \c mode::read_write accessors and \c atomic_ref. + * + * hipSYCL does not have \c sycl::atomic_ref, but has \c sycl::atomic_fetch_add for floats, which + * requires using atomic accessors. Thus, we use \c mode::atomic accessors. + * + * The \ref atomicFetchAdd function could be used for doing operations on such accessors. + */ +static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write : + /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic; + +/*! \brief Convenience wrapper to do atomic addition to a global buffer. + * + * Skips sub-normal values. + * + * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic. + */ +template +static inline void atomicFetchAdd(DeviceAccessor acc, const IndexType idx, const float val) +{ +#if GMX_SYCL_DPCPP + if (cl::sycl::isnormal(val)) + { + sycl_2020::atomic_ref + fout_atomic(acc[idx]); + fout_atomic.fetch_add(val); + } +#elif GMX_SYCL_HIPSYCL + if (std::isnormal(val)) // No sycl::isnormal in hipSYCL + { +# ifdef SYCL_DEVICE_ONLY + /* While there is support for float atomics on device, the host implementation uses + * Clang's __atomic_fetch_add intrinsic, that, at least in Clang 11, does not support + * floats. Luckily, we don't want to run on host. */ + acc[idx].fetch_add(val); +# else + GMX_UNUSED_VALUE(acc); + GMX_UNUSED_VALUE(idx); +# endif + } +#endif +} + +namespace sycl_2020 +{ +#if GMX_SYCL_HIPSYCL +__device__ static inline float shift_left(sycl_2020::sub_group, float var, sycl_2020::sub_group::linear_id_type delta) +{ + // No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet +# ifdef SYCL_DEVICE_ONLY +# if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__) + static const unsigned int sc_cudaFullWarpMask = 0xffffffff; + return __shfl_down_sync(sc_cudaFullWarpMask, var, delta); +# elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__) + // Do we need more ifdefs? 
https://github.com/ROCm-Developer-Tools/HIP/issues/1491 + return __shfl_down(var, delta); +# else +# error "Unsupported hipSYCL target" +# endif +# else + // Should never be called + GMX_UNUSED_VALUE(var); + GMX_UNUSED_VALUE(delta); + assert(false); + return NAN; +# endif +} +__host__ static inline float shift_left(sycl_2020::sub_group, float, sycl_2020::sub_group::linear_id_type) +{ + // Should never be called + assert(false); + return NAN; +} +#elif GMX_SYCL_DPCPP +static inline float shift_left(sycl_2020::sub_group sg, float var, sycl_2020::sub_group::linear_id_type delta) +{ + return sg.shuffle_down(var, delta); +} +#endif + +#if GMX_SYCL_HIPSYCL +__device__ static inline float shift_right(sycl_2020::sub_group, + float var, + sycl_2020::sub_group::linear_id_type delta) +{ + // No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet +# ifdef SYCL_DEVICE_ONLY +# if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__) + static const unsigned int sc_cudaFullWarpMask = 0xffffffff; + return __shfl_up_sync(sc_cudaFullWarpMask, var, delta); +# elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__) + // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491 + return __shfl_up(var, delta); +# else +# error "Unsupported hipSYCL target" +# endif +# else + // Should never be called + assert(false); + GMX_UNUSED_VALUE(var); + GMX_UNUSED_VALUE(delta); + return NAN; +# endif +} +__host__ static inline float shift_right(sycl_2020::sub_group, float, sycl_2020::sub_group::linear_id_type) +{ + // Should never be called + assert(false); + return NAN; +} +#elif GMX_SYCL_DPCPP +static inline float shift_right(sycl_2020::sub_group sg, float var, sycl_2020::sub_group::linear_id_type delta) +{ + return sg.shuffle_up(var, delta); +} +#endif +} // namespace sycl_2020 + +#endif /* GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H */ diff --git a/src/gromacs/hardware/device_management_sycl.cpp b/src/gromacs/hardware/device_management_sycl.cpp index 4fd401e04b..37d739d433 100644 --- a/src/gromacs/hardware/device_management_sycl.cpp +++ b/src/gromacs/hardware/device_management_sycl.cpp @@ -103,8 +103,20 @@ static DeviceStatus isDeviceCompatible(const cl::sycl::device& syclDevice) // Host device does not support subgroups or even querying for sub_group_sizes return DeviceStatus::Incompatible; } + +#if GMX_SYCL_HIPSYCL + /* At the time of writing: + * 1. SYCL NB kernels currently don't support sub_group size of 32 or 64, which are the only + * ones available on NVIDIA and AMD hardware, respectively. That's not a fundamental limitation, + * but requires porting more OpenCL code, see #3934. + * 2. hipSYCL does not support cl::sycl::info::device::sub_group_sizes, + * see https://github.com/illuhad/hipSYCL/pull/449 + */ + const std::vector supportedSubGroupSizes{ warpSize }; +#else const std::vector supportedSubGroupSizes = syclDevice.get_info(); +#endif const size_t requiredSubGroupSizeForNBNXM = 8; if (std::find(supportedSubGroupSizes.begin(), supportedSubGroupSizes.end(), requiredSubGroupSizeForNBNXM) == supportedSubGroupSizes.end()) diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index f1d575bfbe..886679bf82 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -2,7 +2,7 @@ # This file is part of the GROMACS molecular simulation package. # # Copyright (c) 2010,2012,2013,2014,2015 by the GROMACS development team. 
-# Copyright (c) 2018,2019,2020, by the GROMACS development team, led by +# Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -50,7 +50,7 @@ if(GMX_GPU_CUDA) ) endif() -if(GMX_GPU_SYCL) +if(GMX_GPU_SYCL AND NOT GMX_SYCL_HIPSYCL) gmx_add_libgromacs_sources( leapfrog_gpu_sycl.cpp ) diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.h b/src/gromacs/mdlib/tests/leapfrogtestrunners.h index 7590aa2dfa..5c0d0cb740 100644 --- a/src/gromacs/mdlib/tests/leapfrogtestrunners.h +++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -54,7 +54,12 @@ #include "leapfrogtestdata.h" -#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA || GMX_GPU_SYCL) +/* + * LeapFrog is available with CUDA and SYCL. + * However, the use of float3 in CPU code makes it hard to compile with hipSYCL until + * Issue #3312 is resolved. + */ +#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA || (GMX_GPU_SYCL && !GMX_SYCL_HIPSYCL)) namespace gmx { diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index 770732c70c..a372afa775 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -301,25 +301,25 @@ static inline float interpolateCoulombForceR(const DeviceAccessor itemIdx, - const int tidxi, - const int aidx, - DeviceAccessor a_f) +static inline void reduceForceJShuffle(Float3 f, + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int aidx, + DeviceAccessor a_f) { static_assert(c_clSize == 8 || c_clSize == 4); sycl_2020::sub_group sg = itemIdx.get_sub_group(); - f[0] += shuffleDown(f[0], 1, sg); - f[1] += shuffleUp(f[1], 1, sg); - f[2] += shuffleDown(f[2], 1, sg); + f[0] += sycl_2020::shift_left(sg, f[0], 1); + f[1] += sycl_2020::shift_right(sg, f[1], 1); + f[2] += sycl_2020::shift_left(sg, f[2], 1); if (tidxi & 1) { f[0] = f[1]; } - f[0] += shuffleDown(f[0], 2, sg); - f[2] += shuffleUp(f[2], 2, sg); + f[0] += sycl_2020::shift_left(sg, f[0], 2); + f[2] += sycl_2020::shift_right(sg, f[2], 2); if (tidxi & 2) { f[0] = f[2]; @@ -327,7 +327,7 @@ static inline void reduceForceJShuffle(Float3 f if constexpr (c_clSize == 8) { - f[0] += shuffleDown(f[0], 4, sg); + f[0] += sycl_2020::shift_left(sg, f[0], 4); } if (tidxi < 3) @@ -344,13 +344,13 @@ static inline void reduceForceJShuffle(Float3 f static inline void reduceForceIAndFShift(cl::sycl::accessor sm_buf, const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster], const bool calcFShift, - const cl::sycl::nd_item<1> itemIdx, - const int tidxi, - const int tidxj, - const int sci, - const int shift, - DeviceAccessor a_f, - DeviceAccessor a_fShift) + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int tidxj, + const int sci, + const int shift, + DeviceAccessor a_f, + DeviceAccessor a_fShift) { static constexpr int bufStride = c_clSize * c_clSize; static constexpr int clSizeLog2 = gmx::StaticLog2::value; @@ -417,13 +417,13 @@ static inline void 
reduceForceIAndFShift(cl::sycl::accessor -auto nbnxmKernel(cl::sycl::handler& cgh, - DeviceAccessor a_xq, - DeviceAccessor a_f, - DeviceAccessor a_shiftVec, - DeviceAccessor a_fShift, - OptionalAccessor a_energyElec, - OptionalAccessor a_energyVdw, +auto nbnxmKernel(cl::sycl::handler& cgh, + DeviceAccessor a_xq, + DeviceAccessor a_f, + DeviceAccessor a_shiftVec, + DeviceAccessor a_fShift, + OptionalAccessor a_energyElec, + OptionalAccessor a_energyVdw, DeviceAccessor a_plistCJ4, DeviceAccessor a_plistSci, DeviceAccessor a_plistExcl, @@ -745,8 +745,13 @@ auto nbnxmKernel(cl::sycl::handler& cgh, // Ensure distance do not become so small that r^-12 overflows r2 = std::max(r2, c_nbnxnMinDistanceSquared); +#if GMX_SYCL_HIPSYCL + // No fast/native functions in some compilation passes + const float rInv = cl::sycl::rsqrt(r2); +#else // SYCL-TODO: sycl::half_precision::rsqrt? - const float rInv = cl::sycl::native::rsqrt(r2); + const float rInv = cl::sycl::native::rsqrt(r2); +#endif const float r2Inv = rInv * rInv; float r6Inv, fInvR, energyLJPair; if constexpr (!props.vdwCombLB || doCalcEnergies) diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp index 62f4bb7592..d784de3a2c 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp @@ -91,7 +91,7 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh, * For clSize == But we need to set specific sub_group size >= 32 for clSize == 8 for correctness, * but it causes very poor performance. */ - constexpr int requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize; + constexpr int gmx_unused requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize; /* Requirements: * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h index c3aae4ebb5..46c9e38afe 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h @@ -41,6 +41,7 @@ #ifndef GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H #define GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H +#include "gromacs/gpu_utils/sycl_kernel_utils.h" #include "gromacs/nbnxm/pairlist.h" #include "gromacs/nbnxm/pairlistparams.h" @@ -100,28 +101,6 @@ static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d) return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z); } -//! \brief Convenience wrapper to do atomic addition to a global buffer -template -static inline void atomicFetchAdd(DeviceAccessor acc, const IndexType idx, const float val) -{ - if (cl::sycl::isnormal(val)) - { - sycl_2020::atomic_ref - fout_atomic(acc[idx]); - fout_atomic.fetch_add(val); - } -} - -static inline float shuffleDown(float var, unsigned int delta, sycl_2020::sub_group sg) -{ - return sg.shuffle_down(var, delta); -} - -static inline float shuffleUp(float var, unsigned int delta, sycl_2020::sub_group sg) -{ - return sg.shuffle_up(var, delta); -} - } // namespace Nbnxm #endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H -- 2.22.0