- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0-release.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-9-release.gitlab-ci.yml'
+ - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl-release.gitlab-ci.yml'
- local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-sycl.gitlab-ci.yml'
--- /dev/null
+# Test goal: build with hipSYCL (both CUDA and ROCm backends) to check SYCL code compatibility
+# Test intents (should change rarely and conservatively):
+# OS: Ubuntu newest supported
+# Compiler: Clang newest supported
+# GPU: hipSYCL
+# Scope: configure, build
+# Test implementation choices (free to change as needed):
+# OS: Ubuntu 20.04
+# Build type: RelWithAssert
+# Compiler: Clang 11
+# MPI: thread_MPI
+# SIMD: AVX2_256
+
+gromacs:hipsycl-dev:configure:
+ extends:
+ - .gromacs:base:configure
+ - .use-clang:base
+ - .use-sycl
+ - .rules:merge-and-post-merge-acceptance
+ image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a
+ variables:
+ CMAKE: /usr/local/cmake-3.18.4/bin/cmake
+ CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=AVX2_256"
+ CMAKE_BUILD_TYPE_OPTIONS: "-DCMAKE_BUILD_TYPE=RelWithAssert"
+ CMAKE_GPU_OPTIONS: "-DGMX_GPU=SYCL -DGMX_SYCL_HIPSYCL=ON -DHIPSYCL_TARGETS='cuda:sm_60,sm_61,sm_70,sm_75;hip:gfx900'"
+ # Unset COMPILER_LAUNCHER (previously set to ccache) because it conflicts with hipSYCL's syclcc-launcher
+ CMAKE_EXTRA_OPTIONS: "-DCMAKE_C_COMPILER_LAUNCHER= -DCMAKE_CXX_COMPILER_LAUNCHER="
+ COMPILER_MAJOR_VERSION: 11
+
+gromacs:hipsycl-dev:build:
+ extends:
+ - .variables:default
+ - .gromacs:base:build
+ - .before_script:default
+ # Not using ccache because it plays poorly with syclcc-launcher
+ - .rules:merge-and-post-merge-acceptance
+ image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a
+ variables:
+ CMAKE: /usr/local/cmake-3.18.4/bin/cmake
+ tags:
+ - k8s-scilifelab
+ needs:
+ - job: gromacs:hipsycl-dev:configure
+
message(FATAL_ERROR "hipSYCL can only be built with Clang++ compiler")
endif()
set(HIPSYCL_CLANG "${CMAKE_CXX_COMPILER}")
+ # -Wno-unknown-cuda-version because Clang-11 complains about CUDA 11.0-11.2, despite working fine with them.
+ # -Wno-unknown-attributes because hipSYCL does not support reqd_sub_group_size (because it can only do some sub group sizes).
+ # --hipsycl-explicit-multipass is needed when building for both CUDA and HIP.
+ set(HIPSYCL_SYCLCC_EXTRA_ARGS "-Wno-unknown-cuda-version -Wno-unknown-attributes --hipsycl-explicit-multipass")
find_package(hipsycl REQUIRED)
else()
if(CMAKE_CXX_COMPILER MATCHES "dpcpp")
/* Define if SYCL GPU acceleration is compiled */
#cmakedefine01 GMX_GPU_SYCL
+/* Define if hipSYCL is used for SYCL support (otherwise DPCPP is assumed) */
+#cmakedefine01 GMX_SYCL_HIPSYCL
+
+/* Define if Intel's DPCPP is used (default unless hipSYCL is chosen) */
+#define GMX_SYCL_DPCPP (!GMX_SYCL_HIPSYCL)
+
/* Use a single compilation unit when compiling the CUDA (non-bonded) kernels. */
#cmakedefine01 GMX_CUDA_NB_SINGLE_COMPILATION_UNIT
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2020, by the GROMACS development team, led by
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
const bool deviceSupportsTiming = device.get_info<cl::sycl::info::device::queue_profiling>();
if (deviceSupportsTiming)
{
+#if (!GMX_SYCL_HIPSYCL)
+ /* Support for profiling and even the `::enable_profile` property is added in
+ * https://github.com/illuhad/hipSYCL/pull/428, which is not merged at the
+ * time of writing */
propertyList = cl::sycl::property::queue::enable_profiling();
+#endif
}
}
stream_ = cl::sycl::queue(deviceContext.context(), device, propertyList);
#ifndef GMX_GPU_UTILS_GMXSYCL_H
#define GMX_GPU_UTILS_GMXSYCL_H
+#include "config.h"
+
/* Some versions of Intel ICPX compiler (at least 2021.1.1 and 2021.1.2) fail to unroll a loop
* in sycl::accessor::__init, and emit -Wpass-failed=transform-warning. This is a useful
* warning, but mostly noise right now. Probably related to using shared memory accessors.
* The unroll directive was introduced in https://github.com/intel/llvm/pull/2449. */
-#if defined(__INTEL_LLVM_COMPILER)
+#if GMX_SYCL_DPCPP
# include <CL/sycl/version.hpp>
# define DISABLE_UNROLL_WARNINGS \
((__SYCL_COMPILER_VERSION >= 20201113) && (__SYCL_COMPILER_VERSION <= 20201214))
# pragma clang diagnostic ignored "-Wpass-failed"
#endif
+// For hipSYCL, we need to activate floating-point atomics
+#if GMX_SYCL_HIPSYCL
+# define HIPSYCL_EXT_FP_ATOMICS
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wunused-variable"
+# pragma clang diagnostic ignored "-Wunused-parameter"
+# pragma clang diagnostic ignored "-Wmissing-noreturn"
+# pragma clang diagnostic ignored "-Wshadow-field"
+# pragma clang diagnostic ignored "-Wctad-maybe-unsupported"
+# pragma clang diagnostic ignored "-Wdeprecated-copy-dtor"
+# pragma clang diagnostic ignored "-Winconsistent-missing-destructor-override"
+# pragma clang diagnostic ignored "-Wunused-template"
+# pragma clang diagnostic ignored "-Wsign-compare"
+# pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
+# pragma clang diagnostic ignored "-Wdeprecated-copy"
+# pragma clang diagnostic ignored "-Wnewline-eof"
+# pragma clang diagnostic ignored "-Wextra-semi"
+# pragma clang diagnostic ignored "-Wsuggest-override"
+# pragma clang diagnostic ignored "-Wsuggest-destructor-override"
+# pragma clang diagnostic ignored "-Wgcc-compat"
+#endif
+
+
#ifdef DIM
# if DIM != 3
# error "The workaround here assumes we use DIM=3."
# pragma clang diagnostic pop
#endif
+#if GMX_SYCL_HIPSYCL
+# pragma clang diagnostic pop
+#endif
+
#undef DISABLE_UNROLL_WARNINGS
/* Exposing Intel-specific extensions in a manner compatible with SYCL2020 provisional spec.
{
namespace detail
{
-#if defined(__SYCL_COMPILER_VERSION) // Intel DPCPP compiler
+#if GMX_SYCL_DPCPP
// Confirmed to work for 2021.1-beta10 (20201005), 2021.1.1 (20201113), 2021.1.2 (20201214).
namespace origin = cl::sycl::ONEAPI;
-#elif defined(__HIPSYCL__)
+#elif GMX_SYCL_HIPSYCL
namespace origin = cl::sycl;
#else
# error "Unsupported version of SYCL compiler"
using detail::origin::plus;
using detail::origin::sub_group;
-#if defined(__SYCL_COMPILER_VERSION) // Intel DPCPP compiler
+#if GMX_SYCL_DPCPP
using detail::origin::atomic_ref;
template<typename... Args>
bool group_any_of(Args&&... args)
{
return detail::origin::reduce(std::forward<Args>(args)...);
}
-#elif defined(__HIPSYCL__)
-// No atomic_ref in hipSYCL yet (2021-01-29)
+#elif GMX_SYCL_HIPSYCL
+// No atomic_ref in hipSYCL yet (2021-02-22)
using detail::origin::group_any_of;
using detail::origin::group_reduce;
#else
*/
inline void markEvent(const DeviceStream& deviceStream)
{
+# if GMX_SYCL_HIPSYCL
+ deviceStream.stream().wait_and_throw(); // SYCL-TODO: Use CUDA/HIP-specific solutions
+# else
GMX_ASSERT(!event_.has_value(), "Do not call markEvent more than once!");
// Relies on SYCL_INTEL_enqueue_barrier
event_ = deviceStream.stream().submit_barrier();
+# endif
}
/*! \brief Synchronizes the host thread on the marked event.
* As in the OpenCL implementation, the event is released.
*/
inline void waitForEvent()
{
+# if (!GMX_SYCL_HIPSYCL)
event_->wait_and_throw();
+# endif
event_.reset();
}
/*! \brief Checks the completion of the underlying event and resets the object if it was. */
*/
inline void enqueueWaitEvent(const DeviceStream& deviceStream)
{
+# if GMX_SYCL_HIPSYCL
+ deviceStream.stream().wait_and_throw(); // SYCL-TODO: Use CUDA/HIP-specific solutions
+# else
// Relies on SYCL_INTEL_enqueue_barrier
const std::vector<cl::sycl::event> waitlist{ event_.value() };
deviceStream.stream().submit_barrier(waitlist);
event_.reset();
+# endif
}
//! Reset the event to unmarked state.
inline void reset() { event_.reset(); }
--- /dev/null
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+#ifndef GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H
+#define GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H
+
+#include "gmxsycl.h"
+
+/*! \file
+ * \brief SYCL kernel helper functions.
+ *
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ */
+
+/*! \brief Access mode to use for atomic accessors.
+ *
+ * Intel DPCPP compiler has \c sycl::atomic_ref, but has no \c sycl::atomic_fetch_add for floats.
+ * However, \c atomic_ref can not be constructed from \c sycl::atomic, so we can not use
+ * atomic accessors. Thus, we use \c mode::read_write accessors and \c atomic_ref.
+ *
+ * hipSYCL does not have \c sycl::atomic_ref, but has \c sycl::atomic_fetch_add for floats, which
+ * requires using atomic accessors. Thus, we use \c mode::atomic accessors.
+ *
+ * The \ref atomicFetchAdd function could be used for doing operations on such accessors.
+ */
+static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write :
+ /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic;
+
+/*! \brief Convenience wrapper to do atomic addition to a global buffer.
+ *
+ * Skips sub-normal values.
+ *
+ * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic.
+ */
+template<class IndexType>
+static inline void atomicFetchAdd(DeviceAccessor<float, mode_atomic> acc, const IndexType idx, const float val)
+{
+#if GMX_SYCL_DPCPP
+ if (cl::sycl::isnormal(val))
+ {
+ sycl_2020::atomic_ref<float, sycl_2020::memory_order::relaxed, sycl_2020::memory_scope::device, cl::sycl::access::address_space::global_space>
+ fout_atomic(acc[idx]);
+ fout_atomic.fetch_add(val);
+ }
+#elif GMX_SYCL_HIPSYCL
+ if (std::isnormal(val)) // No sycl::isnormal in hipSYCL
+ {
+# ifdef SYCL_DEVICE_ONLY
+ /* While there is support for float atomics on device, the host implementation uses
+ * Clang's __atomic_fetch_add intrinsic, that, at least in Clang 11, does not support
+ * floats. Luckily, we don't want to run on host. */
+ acc[idx].fetch_add(val);
+# else
+ GMX_UNUSED_VALUE(acc);
+ GMX_UNUSED_VALUE(idx);
+# endif
+ }
+#endif
+}
+
+namespace sycl_2020
+{
+#if GMX_SYCL_HIPSYCL
+__device__ static inline float shift_left(sycl_2020::sub_group, float var, sycl_2020::sub_group::linear_id_type delta)
+{
+ // No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet
+# ifdef SYCL_DEVICE_ONLY
+# if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
+ static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
+ return __shfl_down_sync(sc_cudaFullWarpMask, var, delta);
+# elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
+ // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
+ return __shfl_down(var, delta);
+# else
+# error "Unsupported hipSYCL target"
+# endif
+# else
+ // Should never be called
+ GMX_UNUSED_VALUE(var);
+ GMX_UNUSED_VALUE(delta);
+ assert(false);
+ return NAN;
+# endif
+}
+__host__ static inline float shift_left(sycl_2020::sub_group, float, sycl_2020::sub_group::linear_id_type)
+{
+ // Should never be called
+ assert(false);
+ return NAN;
+}
+#elif GMX_SYCL_DPCPP
+static inline float shift_left(sycl_2020::sub_group sg, float var, sycl_2020::sub_group::linear_id_type delta)
+{
+ return sg.shuffle_down(var, delta);
+}
+#endif
+
+#if GMX_SYCL_HIPSYCL
+__device__ static inline float shift_right(sycl_2020::sub_group,
+ float var,
+ sycl_2020::sub_group::linear_id_type delta)
+{
+ // No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet
+# ifdef SYCL_DEVICE_ONLY
+# if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__)
+ static const unsigned int sc_cudaFullWarpMask = 0xffffffff;
+ return __shfl_up_sync(sc_cudaFullWarpMask, var, delta);
+# elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__)
+ // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491
+ return __shfl_up(var, delta);
+# else
+# error "Unsupported hipSYCL target"
+# endif
+# else
+ // Should never be called
+ assert(false);
+ GMX_UNUSED_VALUE(var);
+ GMX_UNUSED_VALUE(delta);
+ return NAN;
+# endif
+}
+__host__ static inline float shift_right(sycl_2020::sub_group, float, sycl_2020::sub_group::linear_id_type)
+{
+ // Should never be called
+ assert(false);
+ return NAN;
+}
+#elif GMX_SYCL_DPCPP
+static inline float shift_right(sycl_2020::sub_group sg, float var, sycl_2020::sub_group::linear_id_type delta)
+{
+ return sg.shuffle_up(var, delta);
+}
+#endif
+} // namespace sycl_2020
+
+#endif /* GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H */
// Host device does not support subgroups or even querying for sub_group_sizes
return DeviceStatus::Incompatible;
}
+
+#if GMX_SYCL_HIPSYCL
+ /* At the time of writing:
+ * 1. SYCL NB kernels currently don't support sub_group size of 32 or 64, which are the only
+ * ones available on NVIDIA and AMD hardware, respectively. That's not a fundamental limitation,
+ * but requires porting more OpenCL code, see #3934.
+ * 2. hipSYCL does not support cl::sycl::info::device::sub_group_sizes,
+ * see https://github.com/illuhad/hipSYCL/pull/449
+ */
+ const std::vector<size_t> supportedSubGroupSizes{ warpSize };
+#else
const std::vector<size_t> supportedSubGroupSizes =
syclDevice.get_info<cl::sycl::info::device::sub_group_sizes>();
+#endif
const size_t requiredSubGroupSizeForNBNXM = 8;
if (std::find(supportedSubGroupSizes.begin(), supportedSubGroupSizes.end(), requiredSubGroupSizeForNBNXM)
== supportedSubGroupSizes.end())
# This file is part of the GROMACS molecular simulation package.
#
# Copyright (c) 2010,2012,2013,2014,2015 by the GROMACS development team.
-# Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
# and including many others, as listed in the AUTHORS file in the
# top-level source directory and at http://www.gromacs.org.
)
endif()
-if(GMX_GPU_SYCL)
+if(GMX_GPU_SYCL AND NOT GMX_SYCL_HIPSYCL)
gmx_add_libgromacs_sources(
leapfrog_gpu_sycl.cpp
)
/*
* This file is part of the GROMACS molecular simulation package.
*
- * Copyright (c) 2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
#include "leapfrogtestdata.h"
-#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA || GMX_GPU_SYCL)
+/*
+ * LeapFrog is available with CUDA and SYCL.
+ * However, the use of float3 in CPU code makes it hard to compile with hipSYCL until
+ * Issue #3312 is resolved.
+ */
+#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA || (GMX_GPU_SYCL && !GMX_SYCL_HIPSYCL))
namespace gmx
{
return lerp(left, right, fraction); // TODO: cl::sycl::mix
}
-static inline void reduceForceJShuffle(Float3 f,
- const cl::sycl::nd_item<1> itemIdx,
- const int tidxi,
- const int aidx,
- DeviceAccessor<float, mode::read_write> a_f)
+static inline void reduceForceJShuffle(Float3 f,
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int aidx,
+ DeviceAccessor<float, mode_atomic> a_f)
{
static_assert(c_clSize == 8 || c_clSize == 4);
sycl_2020::sub_group sg = itemIdx.get_sub_group();
- f[0] += shuffleDown(f[0], 1, sg);
- f[1] += shuffleUp(f[1], 1, sg);
- f[2] += shuffleDown(f[2], 1, sg);
+ f[0] += sycl_2020::shift_left(sg, f[0], 1);
+ f[1] += sycl_2020::shift_right(sg, f[1], 1);
+ f[2] += sycl_2020::shift_left(sg, f[2], 1);
if (tidxi & 1)
{
f[0] = f[1];
}
- f[0] += shuffleDown(f[0], 2, sg);
- f[2] += shuffleUp(f[2], 2, sg);
+ f[0] += sycl_2020::shift_left(sg, f[0], 2);
+ f[2] += sycl_2020::shift_right(sg, f[2], 2);
if (tidxi & 2)
{
f[0] = f[2];
if constexpr (c_clSize == 8)
{
- f[0] += shuffleDown(f[0], 4, sg);
+ f[0] += sycl_2020::shift_left(sg, f[0], 4);
}
if (tidxi < 3)
static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
const bool calcFShift,
- const cl::sycl::nd_item<1> itemIdx,
- const int tidxi,
- const int tidxj,
- const int sci,
- const int shift,
- DeviceAccessor<float, mode::read_write> a_f,
- DeviceAccessor<float, mode::read_write> a_fShift)
+ const cl::sycl::nd_item<1> itemIdx,
+ const int tidxi,
+ const int tidxj,
+ const int sci,
+ const int shift,
+ DeviceAccessor<float, mode_atomic> a_f,
+ DeviceAccessor<float, mode_atomic> a_fShift)
{
static constexpr int bufStride = c_clSize * c_clSize;
static constexpr int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
*
*/
template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
-auto nbnxmKernel(cl::sycl::handler& cgh,
- DeviceAccessor<Float4, mode::read> a_xq,
- DeviceAccessor<float, mode::read_write> a_f,
- DeviceAccessor<Float3, mode::read> a_shiftVec,
- DeviceAccessor<float, mode::read_write> a_fShift,
- OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
- OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
+auto nbnxmKernel(cl::sycl::handler& cgh,
+ DeviceAccessor<Float4, mode::read> a_xq,
+ DeviceAccessor<float, mode_atomic> a_f,
+ DeviceAccessor<Float3, mode::read> a_shiftVec,
+ DeviceAccessor<float, mode_atomic> a_fShift,
+ OptionalAccessor<float, mode_atomic, doCalcEnergies> a_energyElec,
+ OptionalAccessor<float, mode_atomic, doCalcEnergies> a_energyVdw,
DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
DeviceAccessor<nbnxn_sci_t, mode::read> a_plistSci,
DeviceAccessor<nbnxn_excl_t, mode::read> a_plistExcl,
// Ensure distance do not become so small that r^-12 overflows
r2 = std::max(r2, c_nbnxnMinDistanceSquared);
+#if GMX_SYCL_HIPSYCL
+ // No fast/native functions in some compilation passes
+ const float rInv = cl::sycl::rsqrt(r2);
+#else
// SYCL-TODO: sycl::half_precision::rsqrt?
- const float rInv = cl::sycl::native::rsqrt(r2);
+ const float rInv = cl::sycl::native::rsqrt(r2);
+#endif
const float r2Inv = rInv * rInv;
float r6Inv, fInvR, energyLJPair;
if constexpr (!props.vdwCombLB || doCalcEnergies)
* For clSize == But we need to set specific sub_group size >= 32 for clSize == 8 for correctness,
* but it causes very poor performance.
*/
- constexpr int requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize;
+ constexpr int gmx_unused requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize;
/* Requirements:
* Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy
#ifndef GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
#define GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
+#include "gromacs/gpu_utils/sycl_kernel_utils.h"
#include "gromacs/nbnxm/pairlist.h"
#include "gromacs/nbnxm/pairlistparams.h"
return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z);
}
-//! \brief Convenience wrapper to do atomic addition to a global buffer
-template<cl::sycl::access::mode Mode, class IndexType>
-static inline void atomicFetchAdd(DeviceAccessor<float, Mode> acc, const IndexType idx, const float val)
-{
- if (cl::sycl::isnormal(val))
- {
- sycl_2020::atomic_ref<float, sycl_2020::memory_order::relaxed, sycl_2020::memory_scope::device, cl::sycl::access::address_space::global_space>
- fout_atomic(acc[idx]);
- fout_atomic.fetch_add(val);
- }
-}
-
-static inline float shuffleDown(float var, unsigned int delta, sycl_2020::sub_group sg)
-{
- return sg.shuffle_down(var, delta);
-}
-
-static inline float shuffleUp(float var, unsigned int delta, sycl_2020::sub_group sg)
-{
- return sg.shuffle_up(var, delta);
-}
-
} // namespace Nbnxm
#endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H