From 1a21c4596d1548734f3991f812f6ea742796c51a Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko
Date: Tue, 2 Mar 2021 07:30:21 +0000
Subject: [PATCH] Set up build with hipSYCL

The kernel code is currently non-functional on NVIDIA/AMD hardware, but
the build itself is useful to ensure code portability.

- Disabled LeapFrog for hipSYCL. With `float3` being used in the host
  code, it is easiest to just disable the integrator until #3312 is
  resolved. This will have to be resolved in order to run the full MD
  loop on a GPU with hipSYCL. See #3941.
- Some warnings from the compiler and included headers are silenced.
- Workarounds for features not supported by hipSYCL are introduced:
- - Shuffles. Should work, but not tested yet. Hopefully, they will be
    introduced in mainline hipSYCL soon.
- - Barrier synchronization. An Intel-specific extension to SYCL that can
    be imitated fairly well with native OpenCL/CUDA calls, but is only
    crudely stubbed for now. A proper solution will be implemented after
    #2527 and #3924 are resolved.
- CI jobs are added. The set of target device architectures is chosen
  semi-arbitrarily, since the code is not being run yet anyway. The main
  concern was to have both HIP and CUDA covered.

Refs #3923.
---
 admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml  | 1 +
 .../gromacs.hipsycl-dev.gitlab-ci.yml         | 44 +++++
 cmake/gmxManageSYCL.cmake                     | 4 +
 src/config.h.cmakein                          | 6 +
 src/gromacs/gpu_utils/device_stream_sycl.cpp  | 7 +-
 src/gromacs/gpu_utils/gmxsycl.h               | 41 ++++-
 .../gpu_utils/gpueventsynchronizer_sycl.h     | 10 ++
 src/gromacs/gpu_utils/sycl_kernel_utils.h     | 168 ++++++++++++++++++
 .../hardware/device_management_sycl.cpp       | 12 ++
 src/gromacs/mdlib/CMakeLists.txt              | 4 +-
 src/gromacs/mdlib/tests/leapfrogtestrunners.h | 9 +-
 src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp  | 57 +++---
 .../sycl/nbnxm_sycl_kernel_pruneonly.cpp      | 2 +-
 .../nbnxm/sycl/nbnxm_sycl_kernel_utils.h      | 23 +--
 14 files changed, 328 insertions(+), 60 deletions(-)
 create mode 100644 admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml
 create mode 100644 src/gromacs/gpu_utils/sycl_kernel_utils.h

diff --git a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
index 75d9cac850..bab6c38544 100644
--- a/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
+++ b/admin/gitlab-ci/gromacs.matrix.gitlab-ci.yml
@@ -271,6 +271,7 @@ include:
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-8-cuda-11.0-release.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.gcc-9-release.gitlab-ci.yml'
+  - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-opencl-release.gitlab-ci.yml'
   - local: '/admin/gitlab-ci/gromacs.matrix/gromacs.oneapi-2021.1.1-sycl.gitlab-ci.yml'
diff --git a/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml b/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml
new file mode 100644
index 0000000000..ba7c33ca99
--- /dev/null
+++ b/admin/gitlab-ci/gromacs.matrix/gromacs.hipsycl-dev.gitlab-ci.yml
@@ -0,0 +1,44 @@
+# Test goal: build with hipSYCL (both CUDA and ROCm backends) to check SYCL code compatibility
+# Test intents (should change rarely and conservatively):
+# OS: Ubuntu newest supported
+# Compiler: Clang newest supported
+# GPU: hipSYCL
+# Scope: configure, build
+# Test implementation choices (free to change as 
needed): +# OS: Ubuntu 20.04 +# Build type: RelWithAssert +# Compiler: Clang 11 +# MPI: thread_MPI +# SIMD: AVX2_256 + +gromacs:hipsycl-dev:configure: + extends: + - .gromacs:base:configure + - .use-clang:base + - .use-sycl + - .rules:merge-and-post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a + variables: + CMAKE: /usr/local/cmake-3.18.4/bin/cmake + CMAKE_SIMD_OPTIONS: "-DGMX_SIMD=AVX2_256" + CMAKE_BUILD_TYPE_OPTIONS: "-DCMAKE_BUILD_TYPE=RelWithAssert" + CMAKE_GPU_OPTIONS: "-DGMX_GPU=SYCL -DGMX_SYCL_HIPSYCL=ON -DHIPSYCL_TARGETS='cuda:sm_60,sm_61,sm_70,sm_75;hip:gfx900'" + # Unset COMPILER_LAUNCHER (previously set to ccache) because it conflicts with hipSYCL's syclcc-launcher + CMAKE_EXTRA_OPTIONS: "-DCMAKE_C_COMPILER_LAUNCHER= -DCMAKE_CXX_COMPILER_LAUNCHER=" + COMPILER_MAJOR_VERSION: 11 + +gromacs:hipsycl-dev:build: + extends: + - .variables:default + - .gromacs:base:build + - .before_script:default + # Not using ccache because it plays poorly with syclcc-launcher + - .rules:merge-and-post-merge-acceptance + image: ${CI_REGISTRY}/gromacs/gromacs/ci-ubuntu-20.04-llvm-11-cuda-11.2.1-hipsycl-2bc21b677a + variables: + CMAKE: /usr/local/cmake-3.18.4/bin/cmake + tags: + - k8s-scilifelab + needs: + - job: gromacs:hipsycl-dev:configure + diff --git a/cmake/gmxManageSYCL.cmake b/cmake/gmxManageSYCL.cmake index af133a12fa..cf51174536 100644 --- a/cmake/gmxManageSYCL.cmake +++ b/cmake/gmxManageSYCL.cmake @@ -55,6 +55,10 @@ if(GMX_SYCL_HIPSYCL) message(FATAL_ERROR "hipSYCL can only be built with Clang++ compiler") endif() set(HIPSYCL_CLANG "${CMAKE_CXX_COMPILER}") + # -Wno-unknown-cuda-version because Clang-11 complains about CUDA 11.0-11.2, despite working fine with them. + # -Wno-unknown-attributes because hipSYCL does not support reqd_sub_group_size (because it can only do some sub group sizes). + # --hipsycl-explicit-multipass is needed when building for both CUDA and HIP. + set(HIPSYCL_SYCLCC_EXTRA_ARGS "-Wno-unknown-cuda-version -Wno-unknown-attributes --hipsycl-explicit-multipass") find_package(hipsycl REQUIRED) else() if(CMAKE_CXX_COMPILER MATCHES "dpcpp") diff --git a/src/config.h.cmakein b/src/config.h.cmakein index 47797c46c6..2c1da097f0 100644 --- a/src/config.h.cmakein +++ b/src/config.h.cmakein @@ -227,6 +227,12 @@ /* Define if SYCL GPU acceleration is compiled */ #cmakedefine01 GMX_GPU_SYCL +/* Define if hipSYCL is used for SYCL support (otherwise DPCPP is assumed) */ +#cmakedefine01 GMX_SYCL_HIPSYCL + +/* Define if Intel's DPCPP is used (default unless hipSYCL is chosen) */ +#define GMX_SYCL_DPCPP (!GMX_SYCL_HIPSYCL) + /* Use a single compilation unit when compiling the CUDA (non-bonded) kernels. */ #cmakedefine01 GMX_CUDA_NB_SINGLE_COMPILATION_UNIT diff --git a/src/gromacs/gpu_utils/device_stream_sycl.cpp b/src/gromacs/gpu_utils/device_stream_sycl.cpp index 72a431e121..48c6217ab7 100644 --- a/src/gromacs/gpu_utils/device_stream_sycl.cpp +++ b/src/gromacs/gpu_utils/device_stream_sycl.cpp @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2020, by the GROMACS development team, led by + * Copyright (c) 2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. 
@@ -60,7 +60,12 @@ DeviceStream::DeviceStream(const DeviceContext& deviceContext, const bool deviceSupportsTiming = device.get_info(); if (deviceSupportsTiming) { +#if (!GMX_SYCL_HIPSYCL) + /* Support for profiling and even the `::enable_profile` property is added in + * https://github.com/illuhad/hipSYCL/pull/428, which is not merged at the + * time of writing */ propertyList = cl::sycl::property::queue::enable_profiling(); +#endif } } stream_ = cl::sycl::queue(deviceContext.context(), device, propertyList); diff --git a/src/gromacs/gpu_utils/gmxsycl.h b/src/gromacs/gpu_utils/gmxsycl.h index 8eb5cabc9f..77891f937d 100644 --- a/src/gromacs/gpu_utils/gmxsycl.h +++ b/src/gromacs/gpu_utils/gmxsycl.h @@ -52,11 +52,13 @@ #ifndef GMX_GPU_UTILS_GMXSYCL_H #define GMX_GPU_UTILS_GMXSYCL_H +#include "config.h" + /* Some versions of Intel ICPX compiler (at least 2021.1.1 and 2021.1.2) fail to unroll a loop * in sycl::accessor::__init, and emit -Wpass-failed=transform-warning. This is a useful * warning, but mostly noise right now. Probably related to using shared memory accessors. * The unroll directive was introduced in https://github.com/intel/llvm/pull/2449. */ -#if defined(__INTEL_LLVM_COMPILER) +#if GMX_SYCL_DPCPP # include # define DISABLE_UNROLL_WARNINGS \ ((__SYCL_COMPILER_VERSION >= 20201113) && (__SYCL_COMPILER_VERSION <= 20201214)) @@ -69,6 +71,29 @@ # pragma clang diagnostic ignored "-Wpass-failed" #endif +// For hipSYCL, we need to activate floating-point atomics +#if GMX_SYCL_HIPSYCL +# define HIPSYCL_EXT_FP_ATOMICS +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-variable" +# pragma clang diagnostic ignored "-Wunused-parameter" +# pragma clang diagnostic ignored "-Wmissing-noreturn" +# pragma clang diagnostic ignored "-Wshadow-field" +# pragma clang diagnostic ignored "-Wctad-maybe-unsupported" +# pragma clang diagnostic ignored "-Wdeprecated-copy-dtor" +# pragma clang diagnostic ignored "-Winconsistent-missing-destructor-override" +# pragma clang diagnostic ignored "-Wunused-template" +# pragma clang diagnostic ignored "-Wsign-compare" +# pragma clang diagnostic ignored "-Wundefined-reinterpret-cast" +# pragma clang diagnostic ignored "-Wdeprecated-copy" +# pragma clang diagnostic ignored "-Wnewline-eof" +# pragma clang diagnostic ignored "-Wextra-semi" +# pragma clang diagnostic ignored "-Wsuggest-override" +# pragma clang diagnostic ignored "-Wsuggest-destructor-override" +# pragma clang diagnostic ignored "-Wgcc-compat" +#endif + + #ifdef DIM # if DIM != 3 # error "The workaround here assumes we use DIM=3." @@ -85,6 +110,10 @@ # pragma clang diagnostic pop #endif +#if GMX_SYCL_HIPSYCL +# pragma clang diagnostic pop +#endif + #undef DISABLE_UNROLL_WARNINGS /* Exposing Intel-specific extensions in a manner compatible with SYCL2020 provisional spec. @@ -97,10 +126,10 @@ namespace sycl_2020 { namespace detail { -#if defined(__SYCL_COMPILER_VERSION) // Intel DPCPP compiler +#if GMX_SYCL_DPCPP // Confirmed to work for 2021.1-beta10 (20201005), 2021.1.1 (20201113), 2021.1.2 (20201214). namespace origin = cl::sycl::ONEAPI; -#elif defined(__HIPSYCL__) +#elif GMX_SYCL_HIPSYCL namespace origin = cl::sycl; #else # error "Unsupported version of SYCL compiler" @@ -112,7 +141,7 @@ using detail::origin::memory_scope; using detail::origin::plus; using detail::origin::sub_group; -#if defined(__SYCL_COMPILER_VERSION) // Intel DPCPP compiler +#if GMX_SYCL_DPCPP using detail::origin::atomic_ref; template bool group_any_of(Args&&... 
args) @@ -124,8 +153,8 @@ auto group_reduce(Args&&... args) -> decltype(detail::origin::reduce(std::forwar { return detail::origin::reduce(std::forward(args)...); } -#elif defined(__HIPSYCL__) -// No atomic_ref in hipSYCL yet (2021-01-29) +#elif GMX_SYCL_HIPSYCL +// No atomic_ref in hipSYCL yet (2021-02-22) using detail::origin::group_any_of; using detail::origin::group_reduce; #else diff --git a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h index cdbed7cc1b..952fedf242 100644 --- a/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h +++ b/src/gromacs/gpu_utils/gpueventsynchronizer_sycl.h @@ -98,16 +98,22 @@ public: */ inline void markEvent(const DeviceStream& deviceStream) { +# if GMX_SYCL_HIPSYCL + deviceStream.stream().wait_and_throw(); // SYCL-TODO: Use CUDA/HIP-specific solutions +# else GMX_ASSERT(!event_.has_value(), "Do not call markEvent more than once!"); // Relies on SYCL_INTEL_enqueue_barrier event_ = deviceStream.stream().submit_barrier(); +# endif } /*! \brief Synchronizes the host thread on the marked event. * As in the OpenCL implementation, the event is released. */ inline void waitForEvent() { +# if (!GMX_SYCL_HIPSYCL) event_->wait_and_throw(); +# endif event_.reset(); } /*! \brief Checks the completion of the underlying event and resets the object if it was. */ @@ -126,10 +132,14 @@ public: */ inline void enqueueWaitEvent(const DeviceStream& deviceStream) { +# if GMX_SYCL_HIPSYCL + deviceStream.stream().wait_and_throw(); // SYCL-TODO: Use CUDA/HIP-specific solutions +# else // Relies on SYCL_INTEL_enqueue_barrier const std::vector waitlist{ event_.value() }; deviceStream.stream().submit_barrier(waitlist); event_.reset(); +# endif } //! Reset the event to unmarked state. inline void reset() { event_.reset(); } diff --git a/src/gromacs/gpu_utils/sycl_kernel_utils.h b/src/gromacs/gpu_utils/sycl_kernel_utils.h new file mode 100644 index 0000000000..9896a51d3e --- /dev/null +++ b/src/gromacs/gpu_utils/sycl_kernel_utils.h @@ -0,0 +1,168 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. 
Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ + +#ifndef GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H +#define GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H + +#include "gmxsycl.h" + +/*! \file + * \brief SYCL kernel helper functions. + * + * \author Andrey Alekseenko + */ + +/*! \brief Access mode to use for atomic accessors. + * + * Intel DPCPP compiler has \c sycl::atomic_ref, but has no \c sycl::atomic_fetch_add for floats. + * However, \c atomic_ref can not be constructed from \c sycl::atomic, so we can not use + * atomic accessors. Thus, we use \c mode::read_write accessors and \c atomic_ref. + * + * hipSYCL does not have \c sycl::atomic_ref, but has \c sycl::atomic_fetch_add for floats, which + * requires using atomic accessors. Thus, we use \c mode::atomic accessors. + * + * The \ref atomicFetchAdd function could be used for doing operations on such accessors. + */ +static constexpr auto mode_atomic = GMX_SYCL_DPCPP ? cl::sycl::access::mode::read_write : + /* GMX_SYCL_HIPSYCL */ cl::sycl::access::mode::atomic; + +/*! \brief Convenience wrapper to do atomic addition to a global buffer. + * + * Skips sub-normal values. + * + * The implementation differences between DPCPP and hipSYCL are explained in \ref mode_atomic. + */ +template +static inline void atomicFetchAdd(DeviceAccessor acc, const IndexType idx, const float val) +{ +#if GMX_SYCL_DPCPP + if (cl::sycl::isnormal(val)) + { + sycl_2020::atomic_ref + fout_atomic(acc[idx]); + fout_atomic.fetch_add(val); + } +#elif GMX_SYCL_HIPSYCL + if (std::isnormal(val)) // No sycl::isnormal in hipSYCL + { +# ifdef SYCL_DEVICE_ONLY + /* While there is support for float atomics on device, the host implementation uses + * Clang's __atomic_fetch_add intrinsic, that, at least in Clang 11, does not support + * floats. Luckily, we don't want to run on host. */ + acc[idx].fetch_add(val); +# else + GMX_UNUSED_VALUE(acc); + GMX_UNUSED_VALUE(idx); +# endif + } +#endif +} + +namespace sycl_2020 +{ +#if GMX_SYCL_HIPSYCL +__device__ static inline float shift_left(sycl_2020::sub_group, float var, sycl_2020::sub_group::linear_id_type delta) +{ + // No sycl::sub_group::shift_left / shuffle_down in hipSYCL yet +# ifdef SYCL_DEVICE_ONLY +# if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__) + static const unsigned int sc_cudaFullWarpMask = 0xffffffff; + return __shfl_down_sync(sc_cudaFullWarpMask, var, delta); +# elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__) + // Do we need more ifdefs? 
https://github.com/ROCm-Developer-Tools/HIP/issues/1491 + return __shfl_down(var, delta); +# else +# error "Unsupported hipSYCL target" +# endif +# else + // Should never be called + GMX_UNUSED_VALUE(var); + GMX_UNUSED_VALUE(delta); + assert(false); + return NAN; +# endif +} +__host__ static inline float shift_left(sycl_2020::sub_group, float, sycl_2020::sub_group::linear_id_type) +{ + // Should never be called + assert(false); + return NAN; +} +#elif GMX_SYCL_DPCPP +static inline float shift_left(sycl_2020::sub_group sg, float var, sycl_2020::sub_group::linear_id_type delta) +{ + return sg.shuffle_down(var, delta); +} +#endif + +#if GMX_SYCL_HIPSYCL +__device__ static inline float shift_right(sycl_2020::sub_group, + float var, + sycl_2020::sub_group::linear_id_type delta) +{ + // No sycl::sub_group::shift_right / shuffle_up in hipSYCL yet +# ifdef SYCL_DEVICE_ONLY +# if defined(HIPSYCL_PLATFORM_CUDA) && defined(__HIPSYCL_ENABLE_CUDA_TARGET__) + static const unsigned int sc_cudaFullWarpMask = 0xffffffff; + return __shfl_up_sync(sc_cudaFullWarpMask, var, delta); +# elif defined(HIPSYCL_PLATFORM_ROCM) && defined(__HIPSYCL_ENABLE_HIP_TARGET__) + // Do we need more ifdefs? https://github.com/ROCm-Developer-Tools/HIP/issues/1491 + return __shfl_up(var, delta); +# else +# error "Unsupported hipSYCL target" +# endif +# else + // Should never be called + assert(false); + GMX_UNUSED_VALUE(var); + GMX_UNUSED_VALUE(delta); + return NAN; +# endif +} +__host__ static inline float shift_right(sycl_2020::sub_group, float, sycl_2020::sub_group::linear_id_type) +{ + // Should never be called + assert(false); + return NAN; +} +#elif GMX_SYCL_DPCPP +static inline float shift_right(sycl_2020::sub_group sg, float var, sycl_2020::sub_group::linear_id_type delta) +{ + return sg.shuffle_up(var, delta); +} +#endif +} // namespace sycl_2020 + +#endif /* GMX_GPU_UTILS_SYCL_KERNEL_UTILS_H */ diff --git a/src/gromacs/hardware/device_management_sycl.cpp b/src/gromacs/hardware/device_management_sycl.cpp index 4fd401e04b..37d739d433 100644 --- a/src/gromacs/hardware/device_management_sycl.cpp +++ b/src/gromacs/hardware/device_management_sycl.cpp @@ -103,8 +103,20 @@ static DeviceStatus isDeviceCompatible(const cl::sycl::device& syclDevice) // Host device does not support subgroups or even querying for sub_group_sizes return DeviceStatus::Incompatible; } + +#if GMX_SYCL_HIPSYCL + /* At the time of writing: + * 1. SYCL NB kernels currently don't support sub_group size of 32 or 64, which are the only + * ones available on NVIDIA and AMD hardware, respectively. That's not a fundamental limitation, + * but requires porting more OpenCL code, see #3934. + * 2. hipSYCL does not support cl::sycl::info::device::sub_group_sizes, + * see https://github.com/illuhad/hipSYCL/pull/449 + */ + const std::vector supportedSubGroupSizes{ warpSize }; +#else const std::vector supportedSubGroupSizes = syclDevice.get_info(); +#endif const size_t requiredSubGroupSizeForNBNXM = 8; if (std::find(supportedSubGroupSizes.begin(), supportedSubGroupSizes.end(), requiredSubGroupSizeForNBNXM) == supportedSubGroupSizes.end()) diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index f1d575bfbe..886679bf82 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -2,7 +2,7 @@ # This file is part of the GROMACS molecular simulation package. # # Copyright (c) 2010,2012,2013,2014,2015 by the GROMACS development team. 
-# Copyright (c) 2018,2019,2020, by the GROMACS development team, led by +# Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, # and including many others, as listed in the AUTHORS file in the # top-level source directory and at http://www.gromacs.org. @@ -50,7 +50,7 @@ if(GMX_GPU_CUDA) ) endif() -if(GMX_GPU_SYCL) +if(GMX_GPU_SYCL AND NOT GMX_SYCL_HIPSYCL) gmx_add_libgromacs_sources( leapfrog_gpu_sycl.cpp ) diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.h b/src/gromacs/mdlib/tests/leapfrogtestrunners.h index 7590aa2dfa..5c0d0cb740 100644 --- a/src/gromacs/mdlib/tests/leapfrogtestrunners.h +++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.h @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2019,2020, by the GROMACS development team, led by + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -54,7 +54,12 @@ #include "leapfrogtestdata.h" -#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA || GMX_GPU_SYCL) +/* + * LeapFrog is available with CUDA and SYCL. + * However, the use of float3 in CPU code makes it hard to compile with hipSYCL until + * Issue #3312 is resolved. + */ +#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA || (GMX_GPU_SYCL && !GMX_SYCL_HIPSYCL)) namespace gmx { diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index 770732c70c..a372afa775 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -301,25 +301,25 @@ static inline float interpolateCoulombForceR(const DeviceAccessor itemIdx, - const int tidxi, - const int aidx, - DeviceAccessor a_f) +static inline void reduceForceJShuffle(Float3 f, + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int aidx, + DeviceAccessor a_f) { static_assert(c_clSize == 8 || c_clSize == 4); sycl_2020::sub_group sg = itemIdx.get_sub_group(); - f[0] += shuffleDown(f[0], 1, sg); - f[1] += shuffleUp(f[1], 1, sg); - f[2] += shuffleDown(f[2], 1, sg); + f[0] += sycl_2020::shift_left(sg, f[0], 1); + f[1] += sycl_2020::shift_right(sg, f[1], 1); + f[2] += sycl_2020::shift_left(sg, f[2], 1); if (tidxi & 1) { f[0] = f[1]; } - f[0] += shuffleDown(f[0], 2, sg); - f[2] += shuffleUp(f[2], 2, sg); + f[0] += sycl_2020::shift_left(sg, f[0], 2); + f[2] += sycl_2020::shift_right(sg, f[2], 2); if (tidxi & 2) { f[0] = f[2]; @@ -327,7 +327,7 @@ static inline void reduceForceJShuffle(Float3 f if constexpr (c_clSize == 8) { - f[0] += shuffleDown(f[0], 4, sg); + f[0] += sycl_2020::shift_left(sg, f[0], 4); } if (tidxi < 3) @@ -344,13 +344,13 @@ static inline void reduceForceJShuffle(Float3 f static inline void reduceForceIAndFShift(cl::sycl::accessor sm_buf, const Float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster], const bool calcFShift, - const cl::sycl::nd_item<1> itemIdx, - const int tidxi, - const int tidxj, - const int sci, - const int shift, - DeviceAccessor a_f, - DeviceAccessor a_fShift) + const cl::sycl::nd_item<1> itemIdx, + const int tidxi, + const int tidxj, + const int sci, + const int shift, + DeviceAccessor a_f, + DeviceAccessor a_fShift) { static constexpr int bufStride = c_clSize * c_clSize; static constexpr int clSizeLog2 = gmx::StaticLog2::value; @@ -417,13 +417,13 @@ static inline void 
reduceForceIAndFShift(cl::sycl::accessor -auto nbnxmKernel(cl::sycl::handler& cgh, - DeviceAccessor a_xq, - DeviceAccessor a_f, - DeviceAccessor a_shiftVec, - DeviceAccessor a_fShift, - OptionalAccessor a_energyElec, - OptionalAccessor a_energyVdw, +auto nbnxmKernel(cl::sycl::handler& cgh, + DeviceAccessor a_xq, + DeviceAccessor a_f, + DeviceAccessor a_shiftVec, + DeviceAccessor a_fShift, + OptionalAccessor a_energyElec, + OptionalAccessor a_energyVdw, DeviceAccessor a_plistCJ4, DeviceAccessor a_plistSci, DeviceAccessor a_plistExcl, @@ -745,8 +745,13 @@ auto nbnxmKernel(cl::sycl::handler& cgh, // Ensure distance do not become so small that r^-12 overflows r2 = std::max(r2, c_nbnxnMinDistanceSquared); +#if GMX_SYCL_HIPSYCL + // No fast/native functions in some compilation passes + const float rInv = cl::sycl::rsqrt(r2); +#else // SYCL-TODO: sycl::half_precision::rsqrt? - const float rInv = cl::sycl::native::rsqrt(r2); + const float rInv = cl::sycl::native::rsqrt(r2); +#endif const float r2Inv = rInv * rInv; float r6Inv, fInvR, energyLJPair; if constexpr (!props.vdwCombLB || doCalcEnergies) diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp index 62f4bb7592..d784de3a2c 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp @@ -91,7 +91,7 @@ auto nbnxmKernelPruneOnly(cl::sycl::handler& cgh, * For clSize == But we need to set specific sub_group size >= 32 for clSize == 8 for correctness, * but it causes very poor performance. */ - constexpr int requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize; + constexpr int gmx_unused requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize; /* Requirements: * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h index c3aae4ebb5..46c9e38afe 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h @@ -41,6 +41,7 @@ #ifndef GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H #define GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H +#include "gromacs/gpu_utils/sycl_kernel_utils.h" #include "gromacs/nbnxm/pairlist.h" #include "gromacs/nbnxm/pairlistparams.h" @@ -100,28 +101,6 @@ static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d) return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z); } -//! \brief Convenience wrapper to do atomic addition to a global buffer -template -static inline void atomicFetchAdd(DeviceAccessor acc, const IndexType idx, const float val) -{ - if (cl::sycl::isnormal(val)) - { - sycl_2020::atomic_ref - fout_atomic(acc[idx]); - fout_atomic.fetch_add(val); - } -} - -static inline float shuffleDown(float var, unsigned int delta, sycl_2020::sub_group sg) -{ - return sg.shuffle_down(var, delta); -} - -static inline float shuffleUp(float var, unsigned int delta, sycl_2020::sub_group sg) -{ - return sg.shuffle_up(var, delta); -} - } // namespace Nbnxm #endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H -- 2.22.0