From 47370908b87106c4ef7ea8b99df3f86998c3e0fc Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Wed, 19 May 2021 10:53:55 +0000
Subject: [PATCH] Make common Leap-Frog device code backend-agnostic

---
 src/gromacs/mdlib/CMakeLists.txt              |  13 +-
 src/gromacs/mdlib/leapfrog_gpu.cpp            | 175 ++++++++++++++++++
 src/gromacs/mdlib/leapfrog_gpu.h              |   5 +-
 ...apfrog_gpu.cu => leapfrog_gpu_internal.cu} | 168 ++++-------------
 src/gromacs/mdlib/leapfrog_gpu_internal.h     |  71 +++++++
 ...ycl.cpp => leapfrog_gpu_internal_sycl.cpp} | 124 +++---------
 6 files changed, 316 insertions(+), 240 deletions(-)
 create mode 100644 src/gromacs/mdlib/leapfrog_gpu.cpp
 rename src/gromacs/mdlib/{leapfrog_gpu.cu => leapfrog_gpu_internal.cu} (62%)
 create mode 100644 src/gromacs/mdlib/leapfrog_gpu_internal.h
 rename src/gromacs/mdlib/{leapfrog_gpu_sycl.cpp => leapfrog_gpu_internal_sycl.cpp} (65%)

diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt
index 2e645f0362..543f879212 100644
--- a/src/gromacs/mdlib/CMakeLists.txt
+++ b/src/gromacs/mdlib/CMakeLists.txt
@@ -40,7 +40,8 @@ file(GLOB MDLIB_SOURCES *.cpp)
 list(REMOVE_ITEM MDLIB_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/gpuforcereduction_impl_internal_sycl.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_internal_sycl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu.cpp
@@ -51,9 +52,10 @@ list(REMOVE_ITEM MDLIB_SOURCES
 set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
 if(GMX_GPU_CUDA)
     gmx_add_libgromacs_sources(
-       leapfrog_gpu.cu
        gpuforcereduction_impl.cpp
        gpuforcereduction_impl_internal.cu
+       leapfrog_gpu.cpp
+       leapfrog_gpu_internal.cu
        lincs_gpu.cpp
        lincs_gpu_internal.cu
        settle_gpu.cpp
@@ -63,6 +65,7 @@ if(GMX_GPU_CUDA)
     )
     _gmx_add_files_to_property(CUDA_SOURCES
        gpuforcereduction_impl.cpp
+       leapfrog_gpu.cpp
        lincs_gpu.cpp
        settle_gpu.cpp
        update_constrain_gpu_impl.cpp
@@ -73,7 +76,8 @@ if(GMX_GPU_SYCL)
     gmx_add_libgromacs_sources(
        gpuforcereduction_impl.cpp
        gpuforcereduction_impl_internal_sycl.cpp
-       leapfrog_gpu_sycl.cpp
+       leapfrog_gpu.cpp
+       leapfrog_gpu_internal_sycl.cpp
        lincs_gpu.cpp
        lincs_gpu_internal_sycl.cpp
        settle_gpu.cpp
@@ -85,7 +89,8 @@ if(GMX_GPU_SYCL)
     _gmx_add_files_to_property(SYCL_SOURCES
        gpuforcereduction_impl.cpp
        gpuforcereduction_impl_internal_sycl.cpp
-       leapfrog_gpu_sycl.cpp
+       leapfrog_gpu.cpp
+       leapfrog_gpu_internal_sycl.cpp
        lincs_gpu.cpp
        lincs_gpu_internal_sycl.cpp
        settle_gpu.cpp
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cpp b/src/gromacs/mdlib/leapfrog_gpu.cpp
new file mode 100644
index 0000000000..b07c167c7c
--- /dev/null
+++ b/src/gromacs/mdlib/leapfrog_gpu.cpp
@@ -0,0 +1,175 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements the backend-agnostic part of the Leap-Frog GPU integrator
+ *
+ * This file contains backend-agnostic code for the Leap-Frog integrator class on GPU,
+ * including class initialization and data-structure management.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "leapfrog_gpu.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <algorithm>
+#include <cmath>
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/leapfrog_gpu_internal.h"
+#include "gromacs/mdtypes/group.h"
+#include "gromacs/pbcutil/pbc.h"
+#include "gromacs/utility/arrayref.h"
+
+namespace gmx
+{
+
+void LeapFrogGpu::integrate(DeviceBuffer<Float3>              d_x,
+                            DeviceBuffer<Float3>              d_xp,
+                            DeviceBuffer<Float3>              d_v,
+                            const DeviceBuffer<Float3>        d_f,
+                            const float                       dt,
+                            const bool                        doTemperatureScaling,
+                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
+                            const bool                        doParrinelloRahman,
+                            const float                       dtPressureCouple,
+                            const matrix                      prVelocityScalingMatrix)
+{
+
+    if (doTemperatureScaling)
+    {
+        GMX_ASSERT(checkDeviceBuffer(d_lambdas_, numTempScaleValues_),
+                   "Number of temperature scaling factors changed since it was set for the "
+                   "last time.");
+        GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
+                   "Number of temperature scaling factors changed since it was set for the "
+                   "last time.");
+
+        /* In SYCL, we could use host accessors here, without h_lambdas_.
+         * According to a quick test, host accessor is slightly faster when using DPC++ and
+         * LevelZero compared to using h_lambdas_ + cgh.copy. But with DPC++ and OpenCL, the host
+         * accessor waits for fReadyOnDevice in UpdateConstrainGpu::Impl::integrate. See #4023. */
+        for (int i = 0; i < numTempScaleValues_; i++)
+        {
+            h_lambdas_[i] = tcstat[i].lambda;
+        }
+        copyToDeviceBuffer(&d_lambdas_,
+                           h_lambdas_.data(),
+                           0,
+                           numTempScaleValues_,
+                           deviceStream_,
+                           GpuApiCallBehavior::Async,
+                           nullptr);
+    }
+    VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
+    if (doParrinelloRahman)
+    {
+        prVelocityScalingType = VelocityScalingType::Diagonal;
+        GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
+                           && prVelocityScalingMatrix[ZZ][YY] == 0 && prVelocityScalingMatrix[XX][YY] == 0
+                           && prVelocityScalingMatrix[XX][ZZ] == 0 && prVelocityScalingMatrix[YY][ZZ] == 0,
+                   "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
+                   "in GPU version of Leap-Frog integrator.");
+        prVelocityScalingMatrixDiagonal_ = Float3{ dtPressureCouple * prVelocityScalingMatrix[XX][XX],
+                                                   dtPressureCouple * prVelocityScalingMatrix[YY][YY],
+                                                   dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ] };
+    }
+
+    launchLeapFrogKernel(numAtoms_,
+                         d_x,
+                         d_xp,
+                         d_v,
+                         d_f,
+                         d_inverseMasses_,
+                         dt,
+                         doTemperatureScaling,
+                         numTempScaleValues_,
+                         d_tempScaleGroups_,
+                         d_lambdas_,
+                         prVelocityScalingType,
+                         prVelocityScalingMatrixDiagonal_,
+                         deviceStream_);
+}
+
+LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
+                         const DeviceStream&  deviceStream,
+                         const int            numTempScaleValues) :
+    deviceContext_(deviceContext), deviceStream_(deviceStream), numTempScaleValues_(numTempScaleValues)
+{
+    numAtoms_ = 0;
+
+    changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);
+
+    // If the temperature coupling is enabled, we need to make space for scaling factors
+    if (numTempScaleValues_ > 0)
+    {
+        h_lambdas_.resize(numTempScaleValues_);
+        reallocateDeviceBuffer(
+                &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
+    }
+}
+
+LeapFrogGpu::~LeapFrogGpu()
+{
+    freeDeviceBuffer(&d_inverseMasses_);
+}
+
+void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
+{
+    numAtoms_ = numAtoms;
+
+    reallocateDeviceBuffer(
+            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
+    copyToDeviceBuffer(
+            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
+
+    // The temperature scale group map is only used if there is more than one group
+    if (numTempScaleValues_ > 1)
+    {
+        reallocateDeviceBuffer(
+                &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
+        copyToDeviceBuffer(
+                &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
+    }
+}
+
+} // namespace gmx
diff --git a/src/gromacs/mdlib/leapfrog_gpu.h b/src/gromacs/mdlib/leapfrog_gpu.h
index 9102e2a27e..0f48770178 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.h
+++ b/src/gromacs/mdlib/leapfrog_gpu.h
@@ -125,7 +125,7 @@ public:
                    DeviceBuffer<Float3>       d_xp,
                    DeviceBuffer<Float3>       d_v,
                    const DeviceBuffer<Float3> d_f,
-                   const real                 dt,
+                   const float                dt,
                    const bool                 doTemperatureScaling,
                    gmx::ArrayRef<const t_grp_tcstat> tcstat,
                    const bool                 doParrinelloRahman,
@@ -152,8 +152,7 @@ private:
     const DeviceContext& deviceContext_;
     //! GPU stream
     const DeviceStream& deviceStream_;
-    //! GPU kernel launch config
-    KernelLaunchConfig kernelLaunchConfig_;
+
     //! Number of atoms
     int numAtoms_;
 
diff --git a/src/gromacs/mdlib/leapfrog_gpu.cu b/src/gromacs/mdlib/leapfrog_gpu_internal.cu
similarity index 62%
rename from src/gromacs/mdlib/leapfrog_gpu.cu
rename to src/gromacs/mdlib/leapfrog_gpu_internal.cu
index 0ce1174b33..e3f9bc7b82 100644
--- a/src/gromacs/mdlib/leapfrog_gpu.cu
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal.cu
@@ -36,9 +36,7 @@
  *
  * \brief Implements Leap-Frog using CUDA
  *
- * This file contains implementation of basic Leap-Frog integrator
- * using CUDA, including class initialization, data-structures management
- * and GPU kernel.
+ * This file contains the CUDA implementation of the backend-specific code for Leap-Frog.
  *
  * \author Artem Zhmurov <zhmurov@gmail.com>
  *
@@ -46,14 +44,7 @@
  */
 #include "gmxpre.h"
 
-#include "leapfrog_gpu.h"
-
-#include <assert.h>
-#include <stdio.h>
-
-#include <cmath>
-
-#include <algorithm>
+#include "leapfrog_gpu_internal.h"
 
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/devicebuffer.h"
@@ -101,19 +92,6 @@ constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  * \param[in] prVelocityScalingMatrixDiagonal Diagonal elements of Parrinello-Rahman velocity scaling matrix
  */
 template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
-__launch_bounds__(c_maxThreadsPerBlock) __global__
-        void leapfrog_kernel(const int numAtoms,
-                             float3* __restrict__ gm_x,
-                             float3* __restrict__ gm_xp,
-                             float3* __restrict__ gm_v,
-                             const float3* __restrict__ gm_f,
-                             const float* __restrict__ gm_inverseMasses,
-                             const float dt,
-                             const float* __restrict__ gm_lambdas,
-                             const unsigned short* __restrict__ gm_tempScaleGroups,
-                             const float3 prVelocityScalingMatrixDiagonal);
-
-template<NumTempScaleValues numTempScaleValues, VelocityScalingType velocityScaling>
 __launch_bounds__(c_maxThreadsPerBlock) __global__
         void leapfrog_kernel(const int numAtoms,
@@ -176,7 +154,6 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         gm_v[threadIndex] = v;
         gm_x[threadIndex] = x;
     }
-    return;
 }
 
 /*! \brief Select templated kernel.
@@ -238,124 +215,49 @@ inline auto selectLeapFrogKernelPtr(bool doTemperatureScaling,
                                                    prVelocityScalingType);
     return kernelPtr;
 }
 
-void LeapFrogGpu::integrate(DeviceBuffer<Float3>       d_x,
-                            DeviceBuffer<Float3>       d_xp,
-                            DeviceBuffer<Float3>       d_v,
-                            const DeviceBuffer<Float3> d_f,
-                            const real                 dt,
-                            const bool                 doTemperatureScaling,
-                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                            const bool                 doParrinelloRahman,
-                            const float                dtPressureCouple,
-                            const matrix               prVelocityScalingMatrix)
+
+void launchLeapFrogKernel(const int                          numAtoms,
+                          DeviceBuffer<Float3>               d_x,
+                          DeviceBuffer<Float3>               d_xp,
+                          DeviceBuffer<Float3>               d_v,
+                          const DeviceBuffer<Float3>         d_f,
+                          const DeviceBuffer<float>          d_inverseMasses,
+                          const float                        dt,
+                          const bool                         doTemperatureScaling,
+                          const int                          numTempScaleValues,
+                          const DeviceBuffer<unsigned short> d_tempScaleGroups,
+                          const DeviceBuffer<float>          d_lambdas,
+                          const VelocityScalingType          prVelocityScalingType,
+                          const Float3                       prVelocityScalingMatrixDiagonal,
+                          const DeviceStream&                deviceStream)
 {
+    // Checking the buffer types against the kernel argument types
+    static_assert(sizeof(*d_inverseMasses) == sizeof(float), "Incompatible types");
 
-    ensureNoPendingDeviceError("In CUDA version of Leap-Frog integrator");
+    KernelLaunchConfig kernelLaunchConfig;
 
-    auto kernelPtr = leapfrog_kernel<NumTempScaleValues::None, VelocityScalingType::None>;
-    if (doTemperatureScaling || doParrinelloRahman)
-    {
-        if (doTemperatureScaling)
-        {
-            GMX_ASSERT(numTempScaleValues_ == ssize(h_lambdas_),
-                       "Number of temperature scaling factors changed since it was set for the "
-                       "last time.");
-            for (int i = 0; i < numTempScaleValues_; i++)
-            {
-                h_lambdas_[i] = tcstat[i].lambda;
-            }
-            copyToDeviceBuffer(&d_lambdas_,
-                               h_lambdas_.data(),
-                               0,
-                               numTempScaleValues_,
-                               deviceStream_,
-                               GpuApiCallBehavior::Async,
-                               nullptr);
-        }
-        VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
-        if (doParrinelloRahman)
-        {
-            prVelocityScalingType = VelocityScalingType::Diagonal;
-            GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
-                               && prVelocityScalingMatrix[ZZ][YY] == 0
-                               && prVelocityScalingMatrix[XX][YY] == 0
-                               && prVelocityScalingMatrix[XX][ZZ] == 0
-                               && prVelocityScalingMatrix[YY][ZZ] == 0,
-                       "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
-                       "in GPU version of Leap-Frog integrator.");
-            prVelocityScalingMatrixDiagonal_ =
-                    Float3{ dtPressureCouple * prVelocityScalingMatrix[XX][XX],
-                            dtPressureCouple * prVelocityScalingMatrix[YY][YY],
-                            dtPressureCouple * prVelocityScalingMatrix[ZZ][ZZ] };
-        }
-        kernelPtr = selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues_, prVelocityScalingType);
-    }
+    kernelLaunchConfig.gridSize[0]      = (numAtoms + c_threadsPerBlock - 1) / c_threadsPerBlock;
+    kernelLaunchConfig.blockSize[0]     = c_threadsPerBlock;
+    kernelLaunchConfig.blockSize[1]     = 1;
+    kernelLaunchConfig.blockSize[2]     = 1;
+    kernelLaunchConfig.sharedMemorySize = 0;
+
+    auto kernelPtr =
+            selectLeapFrogKernelPtr(doTemperatureScaling, numTempScaleValues, prVelocityScalingType);
 
-    // Checking the buffer types against the kernel argument types
-    static_assert(sizeof(*d_inverseMasses_) == sizeof(float), "Incompatible types");
     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
-                                                      kernelLaunchConfig_,
-                                                      &numAtoms_,
+                                                      kernelLaunchConfig,
+                                                      &numAtoms,
                                                       asFloat3Pointer(&d_x),
                                                       asFloat3Pointer(&d_xp),
                                                       asFloat3Pointer(&d_v),
                                                       asFloat3Pointer(&d_f),
-                                                      &d_inverseMasses_,
+                                                      &d_inverseMasses,
                                                       &dt,
-                                                      &d_lambdas_,
-                                                      &d_tempScaleGroups_,
-                                                      &prVelocityScalingMatrixDiagonal_);
-    launchGpuKernel(kernelPtr, kernelLaunchConfig_, deviceStream_, nullptr, "leapfrog_kernel", kernelArgs);
-
-    return;
-}
-
-LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
-                         const DeviceStream&  deviceStream,
-                         const int            numTempScaleValues) :
-    deviceContext_(deviceContext), deviceStream_(deviceStream), numTempScaleValues_(numTempScaleValues)
-{
-    numAtoms_ = 0;
-
-    changePinningPolicy(&h_lambdas_, gmx::PinningPolicy::PinnedIfSupported);
-
-    kernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
-    kernelLaunchConfig_.blockSize[1]     = 1;
-    kernelLaunchConfig_.blockSize[2]     = 1;
-    kernelLaunchConfig_.sharedMemorySize = 0;
-
-    // If the temperature coupling is enabled, we need to make space for scaling factors
-    if (numTempScaleValues_ > 0)
-    {
-        h_lambdas_.resize(numTempScaleValues_);
-        reallocateDeviceBuffer(
-                &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
-    }
-}
-
-LeapFrogGpu::~LeapFrogGpu()
-{
-    freeDeviceBuffer(&d_inverseMasses_);
-}
-
-void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
-{
-    numAtoms_ = numAtoms;
-    kernelLaunchConfig_.gridSize[0] = (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
-
-    reallocateDeviceBuffer(
-            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
-    copyToDeviceBuffer(
-            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-
-    // Temperature scale group map only used if there are more then one group
-    if (numTempScaleValues_ > 1)
-    {
-        reallocateDeviceBuffer(
-                &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
-        copyToDeviceBuffer(
-                &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-    }
+                                                      &d_lambdas,
+                                                      &d_tempScaleGroups,
+                                                      &prVelocityScalingMatrixDiagonal);
+    launchGpuKernel(kernelPtr, kernelLaunchConfig, deviceStream, nullptr, "leapfrog_kernel", kernelArgs);
 }
 
 } // namespace gmx
diff --git a/src/gromacs/mdlib/leapfrog_gpu_internal.h b/src/gromacs/mdlib/leapfrog_gpu_internal.h
new file mode 100644
index 0000000000..76330fe180
--- /dev/null
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal.h
@@ -0,0 +1,71 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *
+ * \brief Declarations of backend-specific GPU functions for Leap-Frog.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ * \inlibraryapi
+ */
+#ifndef GMX_MDLIB_LEAPFROG_GPU_INTERNAL_H
+#define GMX_MDLIB_LEAPFROG_GPU_INTERNAL_H
+
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/mdlib/leapfrog_gpu.h"
+
+namespace gmx
+{
+
+void launchLeapFrogKernel(int                          numAtoms,
+                          DeviceBuffer<Float3>         d_x,
+                          DeviceBuffer<Float3>         d_xp,
+                          DeviceBuffer<Float3>         d_v,
+                          DeviceBuffer<Float3>         d_f,
+                          DeviceBuffer<float>          d_inverseMasses,
+                          float                        dt,
+                          bool                         doTemperatureScaling,
+                          int                          numTempScaleValues,
+                          DeviceBuffer<unsigned short> d_tempScaleGroups,
+                          DeviceBuffer<float>          d_lambdas,
+                          VelocityScalingType          prVelocityScalingType,
+                          Float3                       prVelocityScalingMatrixDiagonal,
+                          const DeviceStream&          deviceStream);
+
+
+} // namespace gmx
+
+#endif
diff --git a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
similarity index 65%
rename from src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
rename to src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
index 89997a69ca..566c92b1b4 100644
--- a/src/gromacs/mdlib/leapfrog_gpu_sycl.cpp
+++ b/src/gromacs/mdlib/leapfrog_gpu_internal_sycl.cpp
@@ -36,9 +36,7 @@
  *
  * \brief Implements Leap-Frog using SYCL
  *
- * This file contains implementation of basic Leap-Frog integrator
- * using SYCL, including class initialization, data-structures management
- * and GPU kernel.
+ * This file contains the SYCL implementation of the backend-specific code for Leap-Frog.
  *
  * \author Artem Zhmurov <zhmurov@gmail.com>
 * \author Andrey Alekseenko <al42and@gmail.com>
 *
@@ -47,6 +45,8 @@
  */
 #include "gmxpre.h"
 
+#include "leapfrog_gpu_internal.h"
+
 #include "gromacs/gpu_utils/devicebuffer.h"
 #include "gromacs/gpu_utils/gmxsycl.h"
 #include "gromacs/math/vec.h"
@@ -216,114 +216,38 @@ static inline cl::sycl::event launchLeapFrogKernel(NumTempScaleValues tempScali
                                                    prVelocityScalingType);
 }
 
-void LeapFrogGpu::integrate(DeviceBuffer<Float3> d_x,
-                            DeviceBuffer<Float3> d_xp,
-                            DeviceBuffer<Float3> d_v,
-                            DeviceBuffer<Float3> d_f,
-                            const real           dt,
-                            const bool           doTemperatureScaling,
-                            gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                            const bool           doParrinelloRahman,
-                            const float          dtPressureCouple,
-                            const matrix         prVelocityScalingMatrix)
+void launchLeapFrogKernel(int                                numAtoms,
+                          DeviceBuffer<Float3>               d_x,
+                          DeviceBuffer<Float3>               d_xp,
+                          DeviceBuffer<Float3>               d_v,
+                          const DeviceBuffer<Float3>         d_f,
+                          const DeviceBuffer<float>          d_inverseMasses,
+                          const float                        dt,
+                          const bool                         doTemperatureScaling,
+                          const int                          numTempScaleValues,
+                          const DeviceBuffer<unsigned short> d_tempScaleGroups,
+                          const DeviceBuffer<float>          d_lambdas,
+                          const VelocityScalingType          prVelocityScalingType,
+                          const Float3                       prVelocityScalingMatrixDiagonal,
+                          const DeviceStream&                deviceStream)
 {
-    if (doTemperatureScaling)
-    {
-        GMX_ASSERT(checkDeviceBuffer(d_lambdas_, numTempScaleValues_),
-                   "Number of temperature scaling factors changed since it was set for the "
-                   "last time.");
-        GMX_RELEASE_ASSERT(gmx::ssize(h_lambdas_) == numTempScaleValues_,
-                           "Number of temperature scaling factors changed since it was set for the "
-                           "last time.");
-        /* We could use host accessors here, without h_lambdas_.
-         * According to a quick test, host accessor is slightly faster when using DPC++ and
-         * LevelZero compared to using h_lambdas_ + cgh.copy. But with DPC++ and OpenCL, the host
-         * accessor waits for fReadyOnDevice in UpdateConstrainGpu::Impl::integrate. See #4023. */
-
-        for (int i = 0; i < numTempScaleValues_; i++)
-        {
-            h_lambdas_[i] = tcstat[i].lambda;
-        }
-        copyToDeviceBuffer(&d_lambdas_,
-                           h_lambdas_.data(),
-                           0,
-                           numTempScaleValues_,
-                           deviceStream_,
-                           GpuApiCallBehavior::Async,
-                           nullptr);
-    }
     NumTempScaleValues tempVelocityScalingType =
-            getTempScalingType(doTemperatureScaling, numTempScaleValues_);
+            getTempScalingType(doTemperatureScaling, numTempScaleValues);
 
-    VelocityScalingType prVelocityScalingType = VelocityScalingType::None;
-    if (doParrinelloRahman)
-    {
-        prVelocityScalingType = VelocityScalingType::Diagonal;
-        GMX_ASSERT(prVelocityScalingMatrix[YY][XX] == 0 && prVelocityScalingMatrix[ZZ][XX] == 0
-                           && prVelocityScalingMatrix[ZZ][YY] == 0 && prVelocityScalingMatrix[XX][YY] == 0
-                           && prVelocityScalingMatrix[XX][ZZ] == 0 && prVelocityScalingMatrix[YY][ZZ] == 0,
-                   "Fully anisotropic Parrinello-Rahman pressure coupling is not yet supported "
-                   "in GPU version of Leap-Frog integrator.");
-        prVelocityScalingMatrixDiagonal_ = dtPressureCouple
-                                           * Float3{ prVelocityScalingMatrix[XX][XX],
-                                                     prVelocityScalingMatrix[YY][YY],
-                                                     prVelocityScalingMatrix[ZZ][ZZ] };
-    }
     launchLeapFrogKernel(tempVelocityScalingType,
                          prVelocityScalingType,
-                         deviceStream_,
-                         numAtoms_,
+                         deviceStream,
+                         numAtoms,
                          d_x,
                          d_xp,
                          d_v,
                          d_f,
-                         d_inverseMasses_,
+                         d_inverseMasses,
                          dt,
-                         d_lambdas_,
-                         d_tempScaleGroups_,
-                         prVelocityScalingMatrixDiagonal_);
-}
-
-LeapFrogGpu::LeapFrogGpu(const DeviceContext& deviceContext,
-                         const DeviceStream&  deviceStream,
-                         const int            numTempScaleValues) :
-    deviceContext_(deviceContext),
-    deviceStream_(deviceStream),
-    numAtoms_(0),
-    numTempScaleValues_(numTempScaleValues)
-{
-    // If the temperature coupling is enabled, we need to make space for scaling factors
-    if (numTempScaleValues_ > 0)
-    {
-        h_lambdas_.resize(numTempScaleValues_);
-        reallocateDeviceBuffer(
-                &d_lambdas_, numTempScaleValues_, &numLambdas_, &numLambdasAlloc_, deviceContext_);
-    }
-}
-
-LeapFrogGpu::~LeapFrogGpu()
-{
-    freeDeviceBuffer(&d_inverseMasses_);
-}
-
-void LeapFrogGpu::set(const int numAtoms, const real* inverseMasses, const unsigned short* tempScaleGroups)
-{
-    numAtoms_ = numAtoms;
-
-    reallocateDeviceBuffer(
-            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
-    copyToDeviceBuffer(
-            &d_inverseMasses_, inverseMasses, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-
-    // Temperature scale group map only used if there are more then one group
-    if (numTempScaleValues_ > 1)
-    {
-        reallocateDeviceBuffer(
-                &d_tempScaleGroups_, numAtoms_, &numTempScaleGroups_, &numTempScaleGroupsAlloc_, deviceContext_);
-        copyToDeviceBuffer(
-                &d_tempScaleGroups_, tempScaleGroups, 0, numAtoms_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-    }
+                         d_lambdas,
+                         d_tempScaleGroups,
+                         prVelocityScalingMatrixDiagonal);
 }
 
 } // namespace gmx
-- 
2.22.0
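
For reference, a minimal sketch of how a caller drives the class after this refactoring: the backend-agnostic LeapFrogGpu (leapfrog_gpu.cpp) owns all buffer management and delegates the actual kernel launch to launchLeapFrogKernel(), which is resolved at build time to either the CUDA (leapfrog_gpu_internal.cu) or the SYCL (leapfrog_gpu_internal_sycl.cpp) translation unit. The caller-side variable names below are hypothetical stand-ins, not part of the patch:

    // Hypothetical caller (sketch only; in GROMACS this logic lives in the
    // update/constraints machinery). All d_* buffers, the device context and
    // the stream are assumed to be set up by the caller.
    gmx::LeapFrogGpu leapFrog(deviceContext, deviceStream, numTempScaleValues);

    // Upload per-atom data: (re)allocates the inverse-mass buffer and, when
    // more than one temperature-coupling group exists, the group map.
    leapFrog.set(numAtoms, inverseMasses, tempScaleGroups);

    // One MD step: copies fresh scaling factors if T-coupling is active,
    // reduces the Parrinello-Rahman matrix to its diagonal, then calls the
    // backend's launchLeapFrogKernel().
    leapFrog.integrate(d_x, d_xp, d_v, d_f, dt, doTemperatureScaling, tcstat,
                       doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);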