From 92e1eccca1e1f6a687d11915cf391217fdeeb8ad Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko <al42and@gmail.com>
Date: Mon, 9 Nov 2020 17:57:21 +0000
Subject: [PATCH] Add SYCL implementation of LeapFrogGpu

An implementation of LeapFrogGpu integrator using SYCL. Also, some additions to SYCL's DeviceBuffer.

Code pretty much follows CUDA version.

For float3, I had to use gmx::RVec instead of SYCL's built-in cl::sycl::float3, since latter is 16 bytes.
---
 src/gromacs/gpu_utils/devicebuffer_datatype.h |   2 +-
 src/gromacs/gpu_utils/devicebuffer_sycl.h     | 125 ++++++-
 src/gromacs/gpu_utils/gputraits_sycl.h        |   9 +
 src/gromacs/mdlib/CMakeLists.txt              |  11 +-
 src/gromacs/mdlib/leapfrog_gpu.cu             |   2 +-
 src/gromacs/mdlib/leapfrog_gpu.h              |  19 +-
 src/gromacs/mdlib/leapfrog_gpu_sycl.cpp       | 307 ++++++++++++++++++
 src/gromacs/mdlib/tests/leapfrogtestrunners.h |   2 +-
 .../mdlib/tests/leapfrogtestrunners_gpu.cpp   |   8 +
 9 files changed, 459 insertions(+), 26 deletions(-)
 create mode 100644 src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
diff --git a/src/gromacs/gpu_utils/devicebuffer_datatype.h b/src/gromacs/gpu_utils/devicebuffer_datatype.h
index b4be8bd619..d1c27bde9c 100644
--- a/src/gromacs/gpu_utils/devicebuffer_datatype.h
+++ b/src/gromacs/gpu_utils/devicebuffer_datatype.h
@@ -116,7 +116,7 @@ struct DeviceBuffer
     //! Allow implicit conversion to bool to check buffer status for compatibility with other implementations.
     operator bool() const { return buffer_.get() != nullptr; }
 
-    //! An assignment operator to allow resetting buffer by assigning nullptr to it. Necessery for compilation.
+    //! An assignment operator to allow resetting buffer by assigning nullptr to it. Necessary for compilation.
     DeviceBuffer& operator=(std::nullptr_t nullPtr);
 };
 
diff --git a/src/gromacs/gpu_utils/devicebuffer_sycl.h b/src/gromacs/gpu_utils/devicebuffer_sycl.h
index a75e238448..5efc4dd963 100644
--- a/src/gromacs/gpu_utils/devicebuffer_sycl.h
+++ b/src/gromacs/gpu_utils/devicebuffer_sycl.h
@@ -116,8 +116,108 @@ DeviceBuffer<T>& DeviceBuffer<T>::operator=(std::nullptr_t nullPtr)
     return *this;
 }
 
+
+namespace gmx::internal
+{
+//! Shorthand alias to create a placeholder SYCL accessor with chosen data type and access mode.
+template<class T, enum cl::sycl::access::mode mode>
+using PlaceholderAccessor =
+        cl::sycl::accessor<T, 1, mode, cl::sycl::access::target::global_buffer, cl::sycl::access::placeholder::true_t>;
+} // namespace gmx::internal
+
+/** \brief
+ * Thin wrapper around placeholder accessor that allows implicit construction from \c DeviceBuffer.
+ *
+ * "Placeholder accessor" is an indicator of the intent to create an accessor for certain buffer
+ * of a certain type, that is not yet bound to a specific command group handler (device). Such
+ * accessors can be created outside SYCL kernels, which is helpful if we want to pass them as
+ * function arguments.
+ *
+ * \tparam T Type of buffer content.
+ * \tparam mode Access mode.
+ */
+template<class T, enum cl::sycl::access::mode mode>
+class DeviceAccessor : public gmx::internal::PlaceholderAccessor<T, mode>
+{
+public:
+    // Inherit all the constructors
+    using gmx::internal::PlaceholderAccessor<T, mode>::PlaceholderAccessor;
+    //! Construct Accessor from DeviceBuffer (must be initialized)
+    DeviceAccessor(DeviceBuffer<T>& buffer) :
+        gmx::internal::PlaceholderAccessor<T, mode>(getSyclBuffer(buffer))
+    {
+    }
+
+private:
+    //! Helper function to get sycl:buffer object from DeviceBuffer wrapper, with a sanity check.
+    static inline cl::sycl::buffer<T, 1>& getSyclBuffer(DeviceBuffer<T>& buffer)
+    {
+        GMX_ASSERT(bool(buffer), "Trying to construct accessor from an uninitialized buffer");
+        return *buffer.buffer_;
+    }
+};
+
+namespace gmx::internal
+{
+//! A "blackhole" class to be used when we want to ignore an argument to a function.
+struct EmptyClassThatIgnoresConstructorArguments
+{
+    template<class... Args>
+    [[maybe_unused]] EmptyClassThatIgnoresConstructorArguments(Args&&... /*args*/)
+    {
+    }
+};
+} // namespace gmx::internal
+
+/** \brief
+ * Helper class to be used as function argument. Will either correspond to a device accessor, or an empty class.
+ *
+ * Example usage:
+ * \code
+    template <bool doFoo>
+    void getBarKernel(handler& cgh, OptionalAccessor<float, mode::read, doFoo> a_fooPrms)
+    {
+        if constexpr (doFoo)
+            cgh.require(a_fooPrms);
+        // Can only use a_fooPrms if doFoo == true
+    }
+
+    template <bool doFoo>
+    void callBar(DeviceBuffer<float> b_fooPrms)
+    {
+        // If doFoo is false, b_fooPrms will be ignored (can be not initialized).
+        // Otherwise, an accessor will be built (b_fooPrms must be a valid buffer).
+        auto kernel = getBarKernel<doFoo>(b_fooPrms);
+        // If the accessor in not enabled, anything can be passed as its ctor argument.
+        auto kernel2 = getBarKernel<false>(nullptr_t);
+    }
+ * \endcode
+ *
+ * \tparam T Data type of the underlying buffer
+ * \tparam mode Access mode of the accessor
+ * \tparam enabled Compile-time flag indicating whether we want to actually create an accessor.
+ */
+template<class T, enum cl::sycl::access::mode mode, bool enabled>
+using OptionalAccessor =
+        std::conditional_t<enabled, DeviceAccessor<T, mode>, gmx::internal::EmptyClassThatIgnoresConstructorArguments>;
+
 #endif // #ifndef DOXYGEN
 
+/*! \brief Check the validity of the device buffer.
+ *
+ * Checks if the buffer is valid and if its allocation is big enough.
+ *
+ * \param[in] buffer        Device buffer to be checked.
+ * \param[in] requiredSize  Number of elements that the buffer will have to accommodate.
+ *
+ * \returns Whether the device buffer exists and has enough capacity.
+ */
+template<typename T>
+static gmx_unused bool checkDeviceBuffer(const DeviceBuffer<T>& buffer, int requiredSize)
+{
+    return buffer.buffer_ && (static_cast<int>(buffer.buffer_->get_count()) >= requiredSize);
+}
+
 /*! \libinternal \brief
  * Allocates a device-side buffer.
  * It is currently a caller's responsibility to call it only on not-yet allocated buffers.
@@ -182,9 +282,11 @@ void copyToDeviceBuffer(DeviceBuffer<ValueType>* buffer,
         return; // such calls are actually made with empty domains
     }
     GMX_ASSERT(buffer, "needs a buffer pointer");
-    GMX_ASSERT(buffer->buffer_, "needs an initialized buffer pointer");
     GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
 
+    GMX_ASSERT(checkDeviceBuffer(*buffer, startingOffset + numValues),
+               "buffer too small or not initialized");
+
     cl::sycl::buffer<ValueType>& syclBuffer = *buffer->buffer_;
 
     cl::sycl::event ev = deviceStream.stream().submit([&](cl::sycl::handler& cgh) {
@@ -234,6 +336,9 @@ void copyFromDeviceBuffer(ValueType*               hostBuffer,
     GMX_ASSERT(buffer, "needs a buffer pointer");
     GMX_ASSERT(hostBuffer, "needs a host buffer pointer");
 
+    GMX_ASSERT(checkDeviceBuffer(*buffer, startingOffset + numValues),
+               "buffer too small or not initialized");
+
     cl::sycl::buffer<ValueType>& syclBuffer = *buffer->buffer_;
 
     cl::sycl::event ev = deviceStream.stream().submit([&](cl::sycl::handler& cgh) {
@@ -269,6 +374,9 @@ void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer,
     }
     GMX_ASSERT(buffer, "needs a buffer pointer");
 
+    GMX_ASSERT(checkDeviceBuffer(*buffer, startingOffset + numValues),
+               "buffer too small or not initialized");
+
     const ValueType              pattern{};
     cl::sycl::buffer<ValueType>& syclBuffer = *(buffer->buffer_);
 
@@ -280,21 +388,6 @@ void clearDeviceBufferAsync(DeviceBuffer<ValueType>* buffer,
     });
 }
 
-/*! \brief Check the validity of the device buffer.
- *
- * Checks if the buffer is valid and if its allocation is big enough.
- *
- * \param[in] buffer        Device buffer to be checked.
- * \param[in] requiredSize  Number of elements that the buffer will have to accommodate.
- *
- * \returns Whether the device buffer exists and has enough capacity.
- */
-template<typename T>
-static gmx_unused bool checkDeviceBuffer(DeviceBuffer<T> buffer, int requiredSize)
-{
-    return buffer.buffer_ && (static_cast<int>(buffer.buffer_->get_count()) >= requiredSize);
-}
-
 /*! \brief Create a texture object for an array of type ValueType.
  *
  * Creates the device buffer and copies read-only data for an array of type ValueType.
diff --git a/src/gromacs/gpu_utils/gputraits_sycl.h b/src/gromacs/gpu_utils/gputraits_sycl.h
index ab121c8858..52e40bb836 100644
--- a/src/gromacs/gpu_utils/gputraits_sycl.h
+++ b/src/gromacs/gpu_utils/gputraits_sycl.h
@@ -53,6 +53,15 @@ using DeviceTexture = void*;
 //! \brief Single GPU call timing event, not used with SYCL
 using CommandEvent = void*;
 
+//! Convenience alias.
+using float4 = cl::sycl::float4;
+
+//! Convenience alias. Not using cl::sycl::float3 due to alignment issues.
+using float3 = gmx::RVec;
+
+//! Convenience alias for cl::sycl::float2
+using float2 = cl::sycl::float2;
+
 /*! \internal \brief
  * GPU kernels scheduling description. This is same in OpenCL/CUDA.
  * Provides reasonable defaults, one typically only needs to set the GPU stream
diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt
index 6cb02ee517..949a305797 100644
--- a/src/gromacs/mdlib/CMakeLists.txt
+++ b/src/gromacs/mdlib/CMakeLists.txt
@@ -34,6 +34,8 @@
 # the research papers on the package. Check out http://www.gromacs.org.
 
 file(GLOB MDLIB_SOURCES *.cpp)
+# To avoid listing all the necessary files manually, we will remove SYCL-specfific one here:
+list(REMOVE_ITEM MDLIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp)
 
 set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
 if (BUILD_TESTING)
@@ -48,4 +50,11 @@ if(GMX_GPU_CUDA)
        gpuforcereduction_impl.cu
        )
 endif()
-
+if(GMX_GPU_SYCL)
+    gmx_add_libgromacs_sources(
+        leapfrog_gpu_sycl.cpp
+    )
+    _gmx_add_files_to_property(SYCL_SOURCES
+        leapfrog_gpu_sycl.cpp
+    )
+endif()
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu.cu
index 1b5ec5f42b..1793a386b2 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.cu
+++ b/src/gromacs/mdlib/leapfrog_gpu.cu
@@ -97,7 +97,6 @@ constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  * \param[in]     dt                               Timestep.
  * \param[in]     gm_lambdas                       Temperature scaling factors (one per group)
  * \param[in]     gm_tempScaleGroups               Mapping of atoms into groups.
- * \param[in]     dtPressureCouple                 Time step for pressure coupling
  * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
  */
 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
@@ -138,6 +137,7 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         // Swapping places for xp and x so that the x will contain the updated coordinates and xp - the
         // coordinates before update. This should be taken into account when (if) constraints are applied
         // after the update: x and xp have to be passed to constraints in the 'wrong' order.
+        // TODO: Issue #3727
         gm_xp[threadIndex] = x;
 
         if (numTempScaleValues != NumTempScaleValues::None || velocityScaling != VelocityScalingType::None)
diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h
index 8bcf5e37a5..7c76805d09 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.h
+++ b/src/gromacs/mdlib/leapfrog_gpu.h
@@ -50,6 +50,10 @@
 #    include "gromacs/gpu_utils/devicebuffer.cuh"
 #    include "gromacs/gpu_utils/gputraits.cuh"
 #endif
+#if GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/devicebuffer_sycl.h"
+#    include "gromacs/gpu_utils/gputraits_sycl.h"
+#endif
 
 #include "gromacs/gpu_utils/hostallocator.h"
 #include "gromacs/pbcutil/pbc.h"
@@ -72,9 +76,10 @@ namespace gmx
  */
 enum class NumTempScaleValues
 {
-    None,    //!< No temperature coupling
-    Single,  //!< Single T-scaling value (one group)
-    Multiple //!< Multiple T-scaling values, need to use T-group indices
+    None     = 0, //!< No temperature coupling
+    Single   = 1, //!< Single T-scaling value (one group)
+    Multiple = 2, //!< Multiple T-scaling values, need to use T-group indices
+    Count    = 3  //!< Number of valid values
 };
 
 /*! \brief Different variants of the Parrinello-Rahman velocity scaling
@@ -84,8 +89,9 @@ enum class NumTempScaleValues
  */
 enum class VelocityScalingType
 {
-    None,     //!< Do not apply velocity scaling (not a PR-coupling run or step)
-    Diagonal, //!< Apply velocity scaling using a diagonal matrix
+    None     = 0, //!< Do not apply velocity scaling (not a PR-coupling run or step)
+    Diagonal = 1, //!< Apply velocity scaling using a diagonal matrix
+    Count    = 2  //!< Number of valid values
 };
 
 class LeapFrogGpu
@@ -166,7 +172,8 @@ private:
     //! Number of temperature coupling groups (zero = no coupling)
     int numTempScaleValues_ = 0;
     /*! \brief Array with temperature scaling factors.
-     * This is temporary solution to remap data from t_grp_tcstat into plain array
+     * This is temporary solution to remap data from t_grp_tcstat into plain array.
+     * Not used in SYCL.
      * \todo Replace with better solution.
      */
     gmx::HostVector<float> h_lambdas_;
diff --git a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
new file mode 100644
index 0000000000..61484f8968
--- /dev/null
+++ b/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
@@ -0,0 +1,307 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements Leap-Frog using SYCL
+ *
+ * This file contains implementation of basic Leap-Frog integrator
+ * using SYCL, including class initialization, data-structures management
+ * and GPU kernel.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/leapfrog_gpu.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/utility/arrayref.h"
+#include "gromacs/utility/fatalerror.h"
+#include "gromacs/utility/template_mp.h"
+
+namespace gmx
+{
+
+using cl::sycl::access::mode;
+
+/*! \brief Main kernel for the Leap-Frog integrator.
+ *
+ *  The coordinates and velocities are updated on the GPU. Also saves the intermediate values of the coordinates for
+ *   further use in constraints.
+ *
+ *  Each GPU thread works with a single particle.
+ *
+ * \tparam        numTempScaleValues               The number of different T-couple values.
+ * \tparam        velocityScaling                  Type of the Parrinello-Rahman velocity rescaling.
+ * \param         cgh                              SYCL's command group handler.
+ * \param[in,out] a_x                              Coordinates to update upon integration.
+ * \param[out]    a_xp                             A copy of the coordinates before the integration (for constraints).
+ * \param[in,out] a_v                              Velocities to update.
+ * \param[in]     a_f                              Atomic forces.
+ * \param[in]     a_inverseMasses                  Reciprocal masses.
+ * \param[in]     dt                               Timestep.
+ * \param[in]     a_lambdas                        Temperature scaling factors (one per group).
+ * \param[in]     a_tempScaleGroups                Mapping of atoms into groups.
+ * \param[in]     prVelocityScalingMatrixDiagonal  Diagonal elements of Parrinello-Rahman velocity scaling matrix
+ */
+template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
+auto leapFrogKernel(
+        cl::sycl::handler&                          cgh,
+        DeviceAccessor<float3, mode::read_write>    a_x,
+        DeviceAccessor<float3, mode::discard_write> a_xp,
+        DeviceAccessor<float3, mode::read_write>    a_v,
+        DeviceAccessor<float3, mode::read>          a_f,
+        DeviceAccessor<float, mode::read>           a_inverseMasses,
+        float                                       dt,
+        OptionalAccessor<float, mode::read, numTempScaleValues != NumTempScaleValues::None> a_lambdas,
+        OptionalAccessor<unsigned short, mode::read, numTempScaleValues == NumTempScaleValues::Multiple> a_tempScaleGroups,
+        float3 prVelocityScalingMatrixDiagonal)
+{
+    cgh.require(a_x);
+    cgh.require(a_xp);
+    cgh.require(a_v);
+    cgh.require(a_f);
+    cgh.require(a_inverseMasses);
+    if constexpr (numTempScaleValues != NumTempScaleValues::None)
+    {
+        cgh.require(a_lambdas);
+    }
+    if constexpr (numTempScaleValues == NumTempScaleValues::Multiple)
+    {
+        cgh.require(a_tempScaleGroups);
+    }
+
+    return [=](cl::sycl::id<1> itemIdx) {
+        const float3 x    = a_x[itemIdx];
+        const float3 v    = a_v[itemIdx];
+        const float3 f    = a_f[itemIdx];
+        const float  im   = a_inverseMasses[itemIdx];
+        const float  imdt = im * dt;
+
+        // Swapping places for xp and x so that the x will contain the updated coordinates and xp -
+        // the coordinates before update. This should be taken into account when (if) constraints
+        // are applied after the update: x and xp have to be passed to constraints in the 'wrong'
+        // order. See Issue #3727
+        a_xp[itemIdx] = x;
+
+        const float lambda = [=]() {
+            if constexpr (numTempScaleValues == NumTempScaleValues::None)
+            {
+                return 1.0F;
+            }
+            else if constexpr (numTempScaleValues == NumTempScaleValues::Single)
+            {
+                return a_lambdas[0];
+            }
+            else if constexpr (numTempScaleValues == NumTempScaleValues::Multiple)
+            {
+                const int tempScaleGroup = a_tempScaleGroups[itemIdx];
+                return a_lambdas[tempScaleGroup];
+            }
+        }();
+
+        const float3 prVelocityDelta = [=]() {
+            if constexpr (velocityScaling == VelocityScalingType::Diagonal)
+            {
+                return float3{ prVelocityScalingMatrixDiagonal[0] * v[0],
+                               prVelocityScalingMatrixDiagonal[1] * v[1],
+                               prVelocityScalingMatrixDiagonal[2] * v[2] };
+            }
+            else if constexpr (velocityScaling == VelocityScalingType::None)
+            {
+                return float3{ 0, 0, 0 };
+            }
+        }();
+
+        const float3 v_new = v * lambda - prVelocityDelta + f * imdt;
+        a_v[itemIdx]       = v_new;
+        a_x[itemIdx]       = x + v_new * dt;
+    };
+}
+
+// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
+template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
+class LeapFrogKernelName;
+
+template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling, class... Args>
+static cl::sycl::event launchLeapFrogKernel(const DeviceStream& deviceStream, int numAtoms, Args&&... args)
+{
+    // Should not be needed for SYCL2020.
+    using kernelNameType = LeapFrogKernelName<numTempScaleValues, velocityScaling>;
+
+    const cl::sycl::range<1> rangeAllAtoms(numAtoms);
+    cl::sycl::queue          q = deviceStream.stream();
+
+    cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
+        auto kernel =
+                leapFrogKernel<numTempScaleValues, velocityScaling>(cgh, std::forward<Args>(args)...);
+        cgh.parallel_for<kernelNameType>(rangeAllAtoms, kernel);
+    });
+
+    return e;
+}
+
+static NumTempScaleValues getTempScalingType(bool doTemperatureScaling, int numTempScaleValues)
+{
+    if (!doTemperatureScaling)
+    {
+        return NumTempScaleValues::None;
+    }
+    else if (numTempScaleValues == 1)
+    {
+        return NumTempScaleValues::Single;
+    }
+    else if (numTempScaleValues > 1)
+    {
+        return NumTempScaleValues::Multiple;
+    }
+    else
+    {
+        gmx_incons("Temperature coupling was requested with no temperature coupling groups.");
+    }
+}
+
+/*! \brief Select templated kernel and launch it. */
+template<class... Args>
+static inline cl::sycl::event launchLeapFrogKernel(NumTempScaleValues  tempScalingType,
+                                                   VelocityScalingType prVelocityScalingType,
+                                                   Args&&... args)
+{
+    GMX_ASSERT(prVelocityScalingType == VelocityScalingType::None
+                       || prVelocityScalingType == VelocityScalingType::Diagonal,
+               "Only isotropic Parrinello-Rahman pressure coupling is supported.");
+
+    return dispatchTemplatedFunction(
+            [&](auto tempScalingType_, auto prScalingType_) {
+                return launchLeapFrogKernel<tempScalingType_, prScalingType_>(std::forward<Args>(args)...);
+            },
+            tempScalingType, prVelocityScalingType);
+}
+
+void LeapFrogGpu::integrate(DeviceBuffer<float3>              d_x,
+                            DeviceBuffer<float3>              d_xp,
+                            DeviceBuffer<float3>              d_v,
+                            DeviceBuffer<float3>              d_f,
+                            const real                        dt,
+                            const bool                        doTemperatureScaling,
+                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
+                            const bool                        doParrinelloRahman,
+                            const float                       dtPressureCouple,
+                            const matrix                      prVelocityScalingMatrix)
+{
+    if (doTemperatureScaling)
+    {
+        GMX_ASSERT(checkDeviceBuffer(d_lambdas_, numTempScaleValues_),
+                   "Number of temperature scaling factors changed since it was set for the "
+                   "last time.");
+        { // Explicitly limiting the scope of host accessor. Not strictly necessary here.
+            auto ha_lambdas_ = d_lambdas_.buffer_->get_access<mode::discard_write>();
+            for (int i = 0; i < numTempScaleValues_; i++)
+            {
+                ha_lambdas_[i] = tcstat[i].lambda;
+            }
+        }
+    }
+    NumTempScaleValues tempVelocityScalingType =
+            getTempScalingType(doTemperatureScaling, numTempScaleValues_);
+
+    VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
+    if (doParrinelloRahman)
+    {
+        prVelocityScalingType = VelocityScalingType::Diagonal;
+        GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
+                           && prVelocityScalingMatrix[ZZ][YY] == 0 && prVelocityScalingMatrix[XX][YY] == 0
+                           && prVelocityScalingMatrix[XX][ZZ] == 0 && prVelocityScalingMatrix[YY][ZZ] == 0,
+                   "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
+                   "in GPU version of Leap-Frog integrator.");
+        prVelocityScalingMatrixDiagonal_ =
+                dtPressureCouple
+                * float3{ prVelocityScalingMatrix[XX][XX], prVelocityScalingMatrix[YY][YY],
+                          prVelocityScalingMatrix[ZZ][ZZ] };
+    }
+
+    launchLeapFrogKernel(tempVelocityScalingType, prVelocityScalingType, deviceStream_, numAtoms_,
+                         d_x, d_xp, d_v, d_f, d_inverseMasses_, dt, d_lambdas_, d_tempScaleGroups_,
+                         prVelocityScalingMatrixDiagonal_);
+}
+
+LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
+    deviceContext_(deviceContext),
+    deviceStream_(deviceStream),
+    numAtoms_(0)
+{
+}
+
+LeapFrogGpu::~LeapFrogGpu()
+{
+    freeDeviceBuffer(&d_inverseMasses_);
+}
+
+void LeapFrogGpu::set(const int             numAtoms,
+                      const real*           inverseMasses,
+                      const int             numTempScaleValues,
+                      const unsigned short* tempScaleGroups)
+{
+    numAtoms_           = numAtoms;
+    numTempScaleValues_ = numTempScaleValues;
+
+    reallocateDeviceBuffer(&d_inverseMasses_, numAtoms_, &numInverseMasses_,
+                           &numInverseMassesAlloc_, deviceContext_);
+    copyToDeviceBuffer(&d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_,
+                       GpuApiCallBehavior::Sync, nullptr);
+
+    // Temperature scale group map only used if there are more then one group
+    if (numTempScaleValues_ > 1)
+    {
+        reallocateDeviceBuffer(&d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_,
+                               &numTempScaleGroupsAlloc_, deviceContext_);
+        copyToDeviceBuffer(&d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_,
+                           GpuApiCallBehavior::Sync, nullptr);
+    }
+
+    // If the temperature coupling is enabled, we need to make space for scaling factors
+    if (numTempScaleValues_ > 0)
+    {
+        reallocateDeviceBuffer(&d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_,
+                               deviceContext_);
+    }
+}
+
+} // namespace gmx
diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners.h b/src/gromacs/mdlib/tests/leapfrogtestrunners.h
index 599cc63517..7590aa2dfa 100644
--- a/src/gromacs/mdlib/tests/leapfrogtestrunners.h
+++ b/src/gromacs/mdlib/tests/leapfrogtestrunners.h
@@ -54,7 +54,7 @@
 
 #include "leapfrogtestdata.h"
 
-#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA)
+#define HAVE_GPU_LEAPFROG (GMX_GPU_CUDA || GMX_GPU_SYCL)
 
 namespace gmx
 {
diff --git a/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp b/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp
index 5783b66332..1e102035d6 100644
--- a/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp
+++ b/src/gromacs/mdlib/tests/leapfrogtestrunners_gpu.cpp
@@ -50,6 +50,12 @@
 
 #if GMX_GPU_CUDA
 #    include "gromacs/gpu_utils/devicebuffer.cuh"
+#endif
+#if GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/devicebuffer_sycl.h"
+#endif
+
+#if HAVE_GPU_LEAPFROG
 #    include "gromacs/mdlib/leapfrog_gpu.h"
 #endif
 
@@ -70,6 +76,8 @@ void LeapFrogDeviceTestRunner::integrate(LeapFrogTestData* testData, int numStep
 
     int numAtoms = testData->numAtoms_;
 
+    static_assert(sizeof(float3) == sizeof(*testData->x_.data()), "Incompatible types");
+
     float3* h_x  = reinterpret_cast<float3*>(testData->x_.data());
     float3* h_xp = reinterpret_cast<float3*>(testData->xPrime_.data());
     float3* h_v  = reinterpret_cast<float3*>(testData->v_.data());
-- 
2.22.0