From 4e90f9ebb3bf6ad109715859cbdfda1d70976bd3 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Fri, 23 Apr 2021 11:36:31 +0000
Subject: [PATCH] Make SETTLE setup code platform-agnostic

---
 src/gromacs/mdlib/CMakeLists.txt              |  13 +-
 src/gromacs/mdlib/settle_gpu.cpp              | 263 +++++++++++++++++
 .../mdlib/{settle_gpu.cuh => settle_gpu.h}    |  28 +-
 .../{settle_gpu.cu => settle_gpu_internal.cu} | 264 ++++--------------
 src/gromacs/mdlib/settle_gpu_internal.h       |  91 ++++++
 .../mdlib/settle_gpu_internal_sycl.cpp        |  70 +++++
 src/gromacs/mdlib/tests/settletestrunners.cu  |   2 +-
 .../mdlib/update_constrain_gpu_impl.cu        |   2 -
 src/gromacs/mdlib/update_constrain_gpu_impl.h |   2 +-
 9 files changed, 504 insertions(+), 231 deletions(-)
 create mode 100644 src/gromacs/mdlib/settle_gpu.cpp
 rename src/gromacs/mdlib/{settle_gpu.cuh => settle_gpu.h} (91%)
 rename src/gromacs/mdlib/{settle_gpu.cu => settle_gpu_internal.cu} (64%)
 create mode 100644 src/gromacs/mdlib/settle_gpu_internal.h
 create mode 100644 src/gromacs/mdlib/settle_gpu_internal_sycl.cpp

diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt
index 623fcb31df..3f905dedcf 100644
--- a/src/gromacs/mdlib/CMakeLists.txt
+++ b/src/gromacs/mdlib/CMakeLists.txt
@@ -40,7 +40,9 @@ file(GLOB MDLIB_SOURCES *.cpp)
 list(REMOVE_ITEM MDLIB_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp)
+    ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu_internal_sycl.cpp)
 
 set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
 if(GMX_GPU_CUDA)
@@ -48,12 +50,14 @@ if(GMX_GPU_CUDA)
        leapfrog_gpu.cu
        lincs_gpu.cpp
        lincs_gpu_internal.cu
-       settle_gpu.cu
+       settle_gpu.cpp
+       settle_gpu_internal.cu
        update_constrain_gpu_impl.cu
        gpuforcereduction_impl.cu
        )
     _gmx_add_files_to_property(CUDA_SOURCES
        lincs_gpu.cpp
+       settle_gpu.cpp
        )
 endif()
 
@@ -62,11 +66,16 @@ if(GMX_GPU_SYCL)
         leapfrog_gpu_sycl.cpp
         lincs_gpu.cpp
         lincs_gpu_internal_sycl.cpp
+        settle_gpu.cpp
+        settle_gpu_internal_sycl.cpp
     )
+
     _gmx_add_files_to_property(SYCL_SOURCES
         leapfrog_gpu_sycl.cpp
         lincs_gpu.cpp
         lincs_gpu_internal_sycl.cpp
+        settle_gpu.cpp
+        settle_gpu_internal_sycl.cpp
     )
 endif()
 
diff --git a/src/gromacs/mdlib/settle_gpu.cpp b/src/gromacs/mdlib/settle_gpu.cpp
new file mode 100644
index 0000000000..0c19d7ad99
--- /dev/null
+++ b/src/gromacs/mdlib/settle_gpu.cpp
@@ -0,0 +1,263 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements SETTLE using GPU
+ *
+ * This file contains implementation for the data management of GPU version of SETTLE constraints algorithm.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "settle_gpu.h"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <cmath>
+
+#include <algorithm>
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/math/vec.h"
+#include "gromacs/mdlib/settle_gpu_internal.h"
+#include "gromacs/pbcutil/pbc.h"
+
+namespace gmx
+{
+
+void SettleGpu::apply(const DeviceBuffer<Float3> d_x,
+                      DeviceBuffer<Float3>       d_xp,
+                      const bool                 updateVelocities,
+                      DeviceBuffer<Float3>       d_v,
+                      const real                 invdt,
+                      const bool                 computeVirial,
+                      tensor                     virialScaled,
+                      const PbcAiuc              pbcAiuc)
+{
+
+    // Early exit if no settles
+    if (numSettles_ == 0)
+    {
+        return;
+    }
+
+    if (computeVirial)
+    {
+        // Fill with zeros so the values can be reduced to it
+        // Only 6 values are needed because virial is symmetrical
+        clearDeviceBufferAsync(&d_virialScaled_, 0, 6, deviceStream_);
+    }
+
+    launchSettleGpuKernel(numSettles_,
+                          d_atomIds_,
+                          settleParameters_,
+                          d_x,
+                          d_xp,
+                          updateVelocities,
+                          d_v,
+                          invdt,
+                          computeVirial,
+                          d_virialScaled_,
+                          pbcAiuc,
+                          deviceStream_);
+
+
+    if (computeVirial)
+    {
+        copyFromDeviceBuffer(
+                h_virialScaled_.data(), &d_virialScaled_, 0, 6, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
+
+        // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
+        virialScaled[XX][XX] += h_virialScaled_[0];
+        virialScaled[XX][YY] += h_virialScaled_[1];
+        virialScaled[XX][ZZ] += h_virialScaled_[2];
+
+        virialScaled[YY][XX] += h_virialScaled_[1];
+        virialScaled[YY][YY] += h_virialScaled_[3];
+        virialScaled[YY][ZZ] += h_virialScaled_[4];
+
+        virialScaled[ZZ][XX] += h_virialScaled_[2];
+        virialScaled[ZZ][YY] += h_virialScaled_[4];
+        virialScaled[ZZ][ZZ] += h_virialScaled_[5];
+    }
+
+    return;
+}
+
+SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
+    deviceContext_(deviceContext),
+    deviceStream_(deviceStream)
+{
+    static_assert(sizeof(real) == sizeof(float),
+                  "Real numbers should be in single precision in GPU code.");
+
+    // This is to prevent the assertion failure for the systems without water
+    int totalSettles = 0;
+    for (unsigned mt = 0; mt < mtop.moltype.size(); mt++)
+    {
+        const int        nral1           = 1 + NRAL(F_SETTLE);
+        InteractionList  interactionList = mtop.moltype[mt].ilist[F_SETTLE];
+        std::vector<int> iatoms          = interactionList.iatoms;
+        totalSettles += iatoms.size() / nral1;
+    }
+    if (totalSettles == 0)
+    {
+        return;
+    }
+
+    // TODO This should be lifted to a separate subroutine that gets the values of Oxygen and
+    // Hydrogen masses, checks if they are consistent across the topology and if there is no more
+    // than two values for each mass if the free energy perturbation is enabled. In later case,
+    // masses may need to be updated on a regular basis (i.e. in set(...) method).
+    // TODO Do the checks for FEP
+    real mO = -1.0;
+    real mH = -1.0;
+
+    for (unsigned mt = 0; mt < mtop.moltype.size(); mt++)
+    {
+        const int        nral1           = 1 + NRAL(F_SETTLE);
+        InteractionList  interactionList = mtop.moltype[mt].ilist[F_SETTLE];
+        std::vector<int> iatoms          = interactionList.iatoms;
+        for (unsigned i = 0; i < iatoms.size() / nral1; i++)
+        {
+            WaterMolecule settler;
+            settler.ow1 = iatoms[i * nral1 + 1]; // Oxygen index
+            settler.hw2 = iatoms[i * nral1 + 2]; // First hydrogen index
+            settler.hw3 = iatoms[i * nral1 + 3]; // Second hydrogen index
+            t_atom ow1  = mtop.moltype[mt].atoms.atom[settler.ow1];
+            t_atom hw2  = mtop.moltype[mt].atoms.atom[settler.hw2];
+            t_atom hw3  = mtop.moltype[mt].atoms.atom[settler.hw3];
+
+            if (mO < 0)
+            {
+                mO = ow1.m;
+            }
+            if (mH < 0)
+            {
+                mH = hw2.m;
+            }
+            GMX_RELEASE_ASSERT(mO == ow1.m,
+                               "Topology has different values for oxygen mass. Should be identical "
+                               "in order to use SETTLE.");
+            GMX_RELEASE_ASSERT(hw2.m == hw3.m && hw2.m == mH,
+                               "Topology has different values for hydrogen mass. Should be "
+                               "identical in order to use SETTLE.");
+        }
+    }
+
+    GMX_RELEASE_ASSERT(mO > 0, "Could not find oxygen mass in the topology. Needed in SETTLE.");
+    GMX_RELEASE_ASSERT(mH > 0, "Could not find hydrogen mass in the topology. Needed in SETTLE.");
+
+    // TODO Very similar to SETTLE initialization on CPU. Should be lifted to a separate method
+    // (one that gets dOH and dHH values and checks them for consistency)
+    int settle_type = -1;
+    for (unsigned mt = 0; mt < mtop.moltype.size(); mt++)
+    {
+        const int       nral1           = 1 + NRAL(F_SETTLE);
+        InteractionList interactionList = mtop.moltype[mt].ilist[F_SETTLE];
+        for (int i = 0; i < interactionList.size(); i += nral1)
+        {
+            if (settle_type == -1)
+            {
+                settle_type = interactionList.iatoms[i];
+            }
+            else if (interactionList.iatoms[i] != settle_type)
+            {
+                gmx_fatal(FARGS,
+                          "The [molecules] section of your topology specifies more than one block "
+                          "of\n"
+                          "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
+                          "If you are trying to partition your solvent into different *groups*\n"
+                          "(e.g. for freezing, T-coupling, etc.), you are using the wrong "
+                          "approach. Index\n"
+                          "files specify groups. Otherwise, you may wish to change the least-used\n"
+                          "block of molecules with SETTLE constraints into 3 normal constraints.");
+            }
+        }
+    }
+
+    GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
+
+    real dOH = mtop.ffparams.iparams[settle_type].settle.doh;
+    real dHH = mtop.ffparams.iparams[settle_type].settle.dhh;
+
+    settleParameters_ = settleParameters(mO, mH, 1.0 / mO, 1.0 / mH, dOH, dHH);
+
+    allocateDeviceBuffer(&d_virialScaled_, 6, deviceContext_);
+    h_virialScaled_.resize(6);
+}
+
+SettleGpu::~SettleGpu()
+{
+    // Early exit if there is no settles
+    if (numSettles_ == 0)
+    {
+        return;
+    }
+    freeDeviceBuffer(&d_virialScaled_);
+    if (numAtomIdsAlloc_ > 0)
+    {
+        freeDeviceBuffer(&d_atomIds_);
+    }
+}
+
+void SettleGpu::set(const InteractionDefinitions& idef)
+{
+    const int              nral1     = 1 + NRAL(F_SETTLE);
+    const InteractionList& il_settle = idef.il[F_SETTLE];
+    ArrayRef<const int>    iatoms    = il_settle.iatoms;
+    numSettles_                      = il_settle.size() / nral1;
+
+    reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, deviceContext_);
+    h_atomIds_.resize(numSettles_);
+    for (int i = 0; i < numSettles_; i++)
+    {
+        WaterMolecule settler;
+        settler.ow1   = iatoms[i * nral1 + 1]; // Oxygen index
+        settler.hw2   = iatoms[i * nral1 + 2]; // First hydrogen index
+        settler.hw3   = iatoms[i * nral1 + 3]; // Second hydrogen index
+        h_atomIds_[i] = settler;
+    }
+    copyToDeviceBuffer(
+            &d_atomIds_, h_atomIds_.data(), 0, numSettles_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
+}
+
+} // namespace gmx
diff --git a/src/gromacs/mdlib/settle_gpu.cuh b/src/gromacs/mdlib/settle_gpu.h
similarity index 91%
rename from src/gromacs/mdlib/settle_gpu.cuh
rename to src/gromacs/mdlib/settle_gpu.h
index f09fbd344f..641c53d95a 100644
--- a/src/gromacs/mdlib/settle_gpu.cuh
+++ b/src/gromacs/mdlib/settle_gpu.h
@@ -40,8 +40,8 @@
  *
  * \ingroup module_mdlib
  */
-#ifndef GMX_MDLIB_SETTLE_GPU_CUH
-#define GMX_MDLIB_SETTLE_GPU_CUH
+#ifndef GMX_MDLIB_SETTLE_GPU_H
+#define GMX_MDLIB_SETTLE_GPU_H
 
 #include "gmxpre.h"
 
@@ -62,6 +62,18 @@ class InteractionDefinitions;
 namespace gmx
 {
 
+//! Indices of atoms in a water molecule
+struct WaterMolecule
+{
+    //! Oxygen atom
+    int ow1;
+    //! First hydrogen atom
+    int hw2;
+    //! Second hydrogen atom
+    int hw3;
+};
+
+
 /*! \internal \brief Class with interfaces and data for GPU version of SETTLE. */
 class SettleGpu
 {
@@ -134,15 +146,15 @@ private:
     //! Scaled virial tensor (9 reals, GPU)
     std::vector<float> h_virialScaled_;
     //! Scaled virial tensor (9 reals, GPU)
-    float* d_virialScaled_;
+    DeviceBuffer<float> d_virialScaled_;
 
     //! Number of settles
     int numSettles_ = 0;
 
-    //! Indexes of atoms (.x for oxygen, .y and.z for hydrogens, CPU)
-    std::vector<int3> h_atomIds_;
-    //! Indexes of atoms (.x for oxygen, .y and.z for hydrogens, GPU)
-    int3* d_atomIds_;
+    //! Indexes of atoms (.i for oxygen, .j and.k for hydrogens, CPU)
+    std::vector<WaterMolecule> h_atomIds_;
+    //! Indexes of atoms (.i for oxygen, .j and.k for hydrogens, GPU)
+    DeviceBuffer<WaterMolecule> d_atomIds_;
     //! Current size of the array of atom IDs
     int numAtomIds_ = -1;
     //! Allocated size for the array of atom IDs
@@ -154,4 +166,4 @@ private:
 
 } // namespace gmx
 
-#endif // GMX_MDLIB_SETTLE_GPU_CUH
+#endif // GMX_MDLIB_SETTLE_GPU_H
diff --git a/src/gromacs/mdlib/settle_gpu.cu b/src/gromacs/mdlib/settle_gpu_internal.cu
similarity index 64%
rename from src/gromacs/mdlib/settle_gpu.cu
rename to src/gromacs/mdlib/settle_gpu_internal.cu
index 0cc25c50ce..11dd63b035 100644
--- a/src/gromacs/mdlib/settle_gpu.cu
+++ b/src/gromacs/mdlib/settle_gpu_internal.cu
@@ -34,11 +34,7 @@
  */
 /*! \internal \file
  *
- * \brief Implements SETTLE using CUDA
- *
- * This file contains implementation of SETTLE constraints algorithm
- * using CUDA, including class initialization, data-structures management
- * and GPU kernel.
+ * \brief CUDA-specific routines for the GPU implementation of SETTLE constraints algorithm.
  *
  *
  * \author Artem Zhmurov <zhmurov@gmail.com>
@@ -47,7 +43,7 @@
  */
 #include "gmxpre.h"
 
-#include "settle_gpu.cuh"
+#include "settle_gpu_internal.h"
 
 #include <assert.h>
 #include <stdio.h>
@@ -71,9 +67,10 @@ namespace gmx
 {
 
 //! Number of CUDA threads in a block
-constexpr static int c_threadsPerBlock = 256;
+constexpr static int sc_threadsPerBlock = 256;
+
 //! Maximum number of threads in a block (for __launch_bounds__)
-constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
+constexpr static int sc_maxThreadsPerBlock = sc_threadsPerBlock;
 
 /*! \brief SETTLE constraints kernel
  *
@@ -94,9 +91,9 @@ constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
  * \param [in]      pbcAiuc          Periodic boundary conditions data.
  */
 template<bool updateVelocities, bool computeVirial>
-__launch_bounds__(c_maxThreadsPerBlock) __global__
+__launch_bounds__(sc_maxThreadsPerBlock) __global__
         void settle_kernel(const int numSettles,
-                           const int3* __restrict__ gm_settles,
+                           const WaterMolecule* __restrict__ gm_settles,
                            const SettleParameters pars,
                            const float3* __restrict__ gm_x,
                            float3* __restrict__ gm_xprime,
@@ -131,15 +128,15 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
     {
         // These are the indexes of three atoms in a single 'water' molecule.
         // TODO Can be reduced to one integer if atoms are consecutive in memory.
-        int3 indices = gm_settles[tid];
+        WaterMolecule indices = gm_settles[tid];
 
-        float3 x_ow1 = gm_x[indices.x];
-        float3 x_hw2 = gm_x[indices.y];
-        float3 x_hw3 = gm_x[indices.z];
+        float3 x_ow1 = gm_x[indices.ow1];
+        float3 x_hw2 = gm_x[indices.hw2];
+        float3 x_hw3 = gm_x[indices.hw3];
 
-        float3 xprime_ow1 = gm_xprime[indices.x];
-        float3 xprime_hw2 = gm_xprime[indices.y];
-        float3 xprime_hw3 = gm_xprime[indices.z];
+        float3 xprime_ow1 = gm_xprime[indices.ow1];
+        float3 xprime_hw2 = gm_xprime[indices.hw2];
+        float3 xprime_hw3 = gm_xprime[indices.hw3];
 
         float3 dist21 = pbcDxAiuc(pbcAiuc, x_hw2, x_ow1);
         float3 dist31 = pbcDxAiuc(pbcAiuc, x_hw3, x_ow1);
@@ -274,24 +271,24 @@ __launch_bounds__(c_maxThreadsPerBlock) __global__
         const float3 dxHw2 = b3 - b1;
         const float3 dxHw3 = c3 - c1;
 
-        gm_xprime[indices.x] = xprime_ow1 + dxOw1;
-        gm_xprime[indices.y] = xprime_hw2 + dxHw2;
-        gm_xprime[indices.z] = xprime_hw3 + dxHw3;
+        gm_xprime[indices.ow1] = xprime_ow1 + dxOw1;
+        gm_xprime[indices.hw2] = xprime_hw2 + dxHw2;
+        gm_xprime[indices.hw3] = xprime_hw3 + dxHw3;
 
         if (updateVelocities)
         {
-            float3 v_ow1 = gm_v[indices.x];
-            float3 v_hw2 = gm_v[indices.y];
-            float3 v_hw3 = gm_v[indices.z];
+            float3 v_ow1 = gm_v[indices.ow1];
+            float3 v_hw2 = gm_v[indices.hw2];
+            float3 v_hw3 = gm_v[indices.hw3];
 
             /* Add the position correction divided by dt to the velocity */
             v_ow1 = dxOw1 * invdt + v_ow1;
             v_hw2 = dxHw2 * invdt + v_hw2;
             v_hw3 = dxHw3 * invdt + v_hw3;
 
-            gm_v[indices.x] = v_ow1;
-            gm_v[indices.y] = v_hw2;
-            gm_v[indices.z] = v_hw3;
+            gm_v[indices.ow1] = v_ow1;
+            gm_v[indices.hw2] = v_hw2;
+            gm_v[indices.hw3] = v_hw3;
         }
 
         if (computeVirial)
@@ -397,44 +394,37 @@ inline auto getSettleKernelPtr(const bool updateVelocities, const bool computeVi
     return kernelPtr;
 }
 
-void SettleGpu::apply(const DeviceBuffer<Float3> d_x,
-                      DeviceBuffer<Float3>       d_xp,
-                      const bool                 updateVelocities,
-                      DeviceBuffer<Float3>       d_v,
-                      const real                 invdt,
-                      const bool                 computeVirial,
-                      tensor                     virialScaled,
-                      const PbcAiuc              pbcAiuc)
+void launchSettleGpuKernel(const int                         numSettles,
+                           const DeviceBuffer<WaterMolecule> d_atomIds,
+                           const SettleParameters            settleParameters,
+                           const DeviceBuffer<Float3>        d_x,
+                           DeviceBuffer<Float3>              d_xp,
+                           const bool                        updateVelocities,
+                           DeviceBuffer<Float3>              d_v,
+                           const real                        invdt,
+                           const bool                        computeVirial,
+                           DeviceBuffer<float>               virialScaled,
+                           const PbcAiuc                     pbcAiuc,
+                           const DeviceStream&               deviceStream)
 {
-
-    ensureNoPendingDeviceError("In CUDA version SETTLE");
-
-    // Early exit if no settles
-    if (numSettles_ == 0)
-    {
-        return;
-    }
-
-    if (computeVirial)
-    {
-        // Fill with zeros so the values can be reduced to it
-        // Only 6 values are needed because virial is symmetrical
-        clearDeviceBufferAsync(&d_virialScaled_, 0, 6, deviceStream_);
-    }
+    static_assert(
+            gmx::isPowerOfTwo(sc_threadsPerBlock),
+            "Number of threads per block should be a power of two in order for reduction to work.");
 
     auto kernelPtr = getSettleKernelPtr(updateVelocities, computeVirial);
 
     KernelLaunchConfig config;
-    config.blockSize[0] = c_threadsPerBlock;
+    config.blockSize[0] = sc_threadsPerBlock;
     config.blockSize[1] = 1;
     config.blockSize[2] = 1;
-    config.gridSize[0]  = (numSettles_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
+    config.gridSize[0]  = (numSettles + sc_threadsPerBlock - 1) / sc_threadsPerBlock;
     config.gridSize[1]  = 1;
     config.gridSize[2]  = 1;
+
     // Shared memory is only used for virial reduction
     if (computeVirial)
     {
-        config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float);
+        config.sharedMemorySize = sc_threadsPerBlock * 6 * sizeof(float);
     }
     else
     {
@@ -443,184 +433,24 @@ void SettleGpu::apply(const DeviceBuffer<Float3> d_x,
 
     const auto kernelArgs = prepareGpuKernelArguments(kernelPtr,
                                                       config,
-                                                      &numSettles_,
-                                                      &d_atomIds_,
-                                                      &settleParameters_,
+                                                      &numSettles,
+                                                      &d_atomIds,
+                                                      &settleParameters,
                                                       asFloat3Pointer(&d_x),
                                                       asFloat3Pointer(&d_xp),
                                                       &invdt,
                                                       asFloat3Pointer(&d_v),
-                                                      &d_virialScaled_,
+                                                      &virialScaled,
                                                       &pbcAiuc);
 
     launchGpuKernel(kernelPtr,
                     config,
-                    deviceStream_,
+                    deviceStream,
                     nullptr,
                     "settle_kernel<updateVelocities, computeVirial>",
                     kernelArgs);
 
-    if (computeVirial)
-    {
-        copyFromDeviceBuffer(
-                h_virialScaled_.data(), &d_virialScaled_, 0, 6, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-
-        // Mapping [XX, XY, XZ, YY, YZ, ZZ] internal format to a tensor object
-        virialScaled[XX][XX] += h_virialScaled_[0];
-        virialScaled[XX][YY] += h_virialScaled_[1];
-        virialScaled[XX][ZZ] += h_virialScaled_[2];
-
-        virialScaled[YY][XX] += h_virialScaled_[1];
-        virialScaled[YY][YY] += h_virialScaled_[3];
-        virialScaled[YY][ZZ] += h_virialScaled_[4];
-
-        virialScaled[ZZ][XX] += h_virialScaled_[2];
-        virialScaled[ZZ][YY] += h_virialScaled_[4];
-        virialScaled[ZZ][ZZ] += h_virialScaled_[5];
-    }
-
     return;
 }
 
-SettleGpu::SettleGpu(const gmx_mtop_t& mtop, const DeviceContext& deviceContext, const DeviceStream& deviceStream) :
-    deviceContext_(deviceContext),
-    deviceStream_(deviceStream)
-{
-    static_assert(sizeof(real) == sizeof(float),
-                  "Real numbers should be in single precision in GPU code.");
-    static_assert(
-            gmx::isPowerOfTwo(c_threadsPerBlock),
-            "Number of threads per block should be a power of two in order for reduction to work.");
-
-    // This is to prevent the assertion failure for the systems without water
-    int totalSettles = 0;
-    for (unsigned mt = 0; mt < mtop.moltype.size(); mt++)
-    {
-        const int        nral1           = 1 + NRAL(F_SETTLE);
-        InteractionList  interactionList = mtop.moltype.at(mt).ilist[F_SETTLE];
-        std::vector<int> iatoms          = interactionList.iatoms;
-        totalSettles += iatoms.size() / nral1;
-    }
-    if (totalSettles == 0)
-    {
-        return;
-    }
-
-    // TODO This should be lifted to a separate subroutine that gets the values of Oxygen and
-    // Hydrogen masses, checks if they are consistent across the topology and if there is no more
-    // than two values for each mass if the free energy perturbation is enabled. In later case,
-    // masses may need to be updated on a regular basis (i.e. in set(...) method).
-    // TODO Do the checks for FEP
-    real mO = -1.0;
-    real mH = -1.0;
-
-    for (unsigned mt = 0; mt < mtop.moltype.size(); mt++)
-    {
-        const int        nral1           = 1 + NRAL(F_SETTLE);
-        InteractionList  interactionList = mtop.moltype.at(mt).ilist[F_SETTLE];
-        std::vector<int> iatoms          = interactionList.iatoms;
-        for (unsigned i = 0; i < iatoms.size() / nral1; i++)
-        {
-            int3 settler;
-            settler.x  = iatoms[i * nral1 + 1]; // Oxygen index
-            settler.y  = iatoms[i * nral1 + 2]; // First hydrogen index
-            settler.z  = iatoms[i * nral1 + 3]; // Second hydrogen index
-            t_atom ow1 = mtop.moltype.at(mt).atoms.atom[settler.x];
-            t_atom hw2 = mtop.moltype.at(mt).atoms.atom[settler.y];
-            t_atom hw3 = mtop.moltype.at(mt).atoms.atom[settler.z];
-
-            if (mO < 0)
-            {
-                mO = ow1.m;
-            }
-            if (mH < 0)
-            {
-                mH = hw2.m;
-            }
-            GMX_RELEASE_ASSERT(mO == ow1.m,
-                               "Topology has different values for oxygen mass. Should be identical "
-                               "in order to use SETTLE.");
-            GMX_RELEASE_ASSERT(hw2.m == hw3.m && hw2.m == mH,
-                               "Topology has different values for hydrogen mass. Should be "
-                               "identical in order to use SETTLE.");
-        }
-    }
-
-    GMX_RELEASE_ASSERT(mO > 0, "Could not find oxygen mass in the topology. Needed in SETTLE.");
-    GMX_RELEASE_ASSERT(mH > 0, "Could not find hydrogen mass in the topology. Needed in SETTLE.");
-
-    // TODO Very similar to SETTLE initialization on CPU. Should be lifted to a separate method
-    // (one that gets dOH and dHH values and checks them for consistency)
-    int settle_type = -1;
-    for (unsigned mt = 0; mt < mtop.moltype.size(); mt++)
-    {
-        const int       nral1           = 1 + NRAL(F_SETTLE);
-        InteractionList interactionList = mtop.moltype.at(mt).ilist[F_SETTLE];
-        for (int i = 0; i < interactionList.size(); i += nral1)
-        {
-            if (settle_type == -1)
-            {
-                settle_type = interactionList.iatoms[i];
-            }
-            else if (interactionList.iatoms[i] != settle_type)
-            {
-                gmx_fatal(FARGS,
-                          "The [molecules] section of your topology specifies more than one block "
-                          "of\n"
-                          "a [moleculetype] with a [settles] block. Only one such is allowed.\n"
-                          "If you are trying to partition your solvent into different *groups*\n"
-                          "(e.g. for freezing, T-coupling, etc.), you are using the wrong "
-                          "approach. Index\n"
-                          "files specify groups. Otherwise, you may wish to change the least-used\n"
-                          "block of molecules with SETTLE constraints into 3 normal constraints.");
-            }
-        }
-    }
-
-    GMX_RELEASE_ASSERT(settle_type >= 0, "settle_init called without settles");
-
-    real dOH = mtop.ffparams.iparams[settle_type].settle.doh;
-    real dHH = mtop.ffparams.iparams[settle_type].settle.dhh;
-
-    settleParameters_ = settleParameters(mO, mH, 1.0 / mO, 1.0 / mH, dOH, dHH);
-
-    allocateDeviceBuffer(&d_virialScaled_, 6, deviceContext_);
-    h_virialScaled_.resize(6);
-}
-
-SettleGpu::~SettleGpu()
-{
-    // Early exit if there is no settles
-    if (numSettles_ == 0)
-    {
-        return;
-    }
-    freeDeviceBuffer(&d_virialScaled_);
-    if (numAtomIdsAlloc_ > 0)
-    {
-        freeDeviceBuffer(&d_atomIds_);
-    }
-}
-
-void SettleGpu::set(const InteractionDefinitions& idef)
-{
-    const int              nral1     = 1 + NRAL(F_SETTLE);
-    const InteractionList& il_settle = idef.il[F_SETTLE];
-    ArrayRef<const int>    iatoms    = il_settle.iatoms;
-    numSettles_                      = il_settle.size() / nral1;
-
-    reallocateDeviceBuffer(&d_atomIds_, numSettles_, &numAtomIds_, &numAtomIdsAlloc_, deviceContext_);
-    h_atomIds_.resize(numSettles_);
-    for (int i = 0; i < numSettles_; i++)
-    {
-        int3 settler;
-        settler.x        = iatoms[i * nral1 + 1]; // Oxygen index
-        settler.y        = iatoms[i * nral1 + 2]; // First hydrogen index
-        settler.z        = iatoms[i * nral1 + 3]; // Second hydrogen index
-        h_atomIds_.at(i) = settler;
-    }
-    copyToDeviceBuffer(
-            &d_atomIds_, h_atomIds_.data(), 0, numSettles_, deviceStream_, GpuApiCallBehavior::Sync, nullptr);
-}
-
 } // namespace gmx
diff --git a/src/gromacs/mdlib/settle_gpu_internal.h b/src/gromacs/mdlib/settle_gpu_internal.h
new file mode 100644
index 0000000000..9963944e74
--- /dev/null
+++ b/src/gromacs/mdlib/settle_gpu_internal.h
@@ -0,0 +1,91 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares backend-specific functions for GPU implementation of SETTLE.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#ifndef GMX_MDLIB_SETTLE_GPU_INTERNAL_H
+#define GMX_MDLIB_SETTLE_GPU_INTERNAL_H
+
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/mdlib/settle_gpu.h"
+
+namespace gmx
+{
+
+/*! \brief Apply SETTLE.
+ *
+ * Applies SETTLE to coordinates and velocities, stored on GPU. Data at pointers d_xp and
+ * d_v change in the GPU memory. The results are not automatically copied back to the CPU
+ * memory. Method uses this class data structures which should be updated when needed using
+ * update method.
+ *
+ * \param[in]     numSettles        Number of SETTLE constraints.
+ * \param[in]     d_atomIds         Device buffer with indices of atoms to be SETTLEd.
+ * \param[in]     settleParameters  Parameters for SETTLE constraints.
+ * \param[in]     d_x               Coordinates before timestep (in GPU memory)
+ * \param[in,out] d_xp              Coordinates after timestep (in GPU memory). The
+ *                                  resulting constrained coordinates will be saved here.
+ * \param[in]     updateVelocities  If the velocities should be updated.
+ * \param[in,out] d_v               Velocities to update (in GPU memory, can be nullptr
+ *                                  if not updated)
+ * \param[in]     invdt             Reciprocal timestep (to scale Lagrange
+ *                                  multipliers when velocities are updated)
+ * \param[in]     computeVirial     If virial should be updated.
+ * \param[in,out] virialScaled      Scaled virial tensor to be updated.
+ * \param[in]     pbcAiuc           PBC data.
+ * \param[in]     deviceStream      Device stream to launch kernel in.
+ */
+void launchSettleGpuKernel(int                               numSettles,
+                           const DeviceBuffer<WaterMolecule> d_atomIds,
+                           const SettleParameters            settleParameters,
+                           const DeviceBuffer<Float3>        d_x,
+                           DeviceBuffer<Float3>              d_xp,
+                           const bool                        updateVelocities,
+                           DeviceBuffer<Float3>              d_v,
+                           const real                        invdt,
+                           const bool                        computeVirial,
+                           DeviceBuffer<float>               virialScaled,
+                           const PbcAiuc                     pbcAiuc,
+                           const DeviceStream&               deviceStream);
+
+} // namespace gmx
+
+#endif // GMX_MDLIB_SETTLE_GPU_INTERNAL_H
diff --git a/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
new file mode 100644
index 0000000000..ef063c36d5
--- /dev/null
+++ b/src/gromacs/mdlib/settle_gpu_internal_sycl.cpp
@@ -0,0 +1,70 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief SYCL-specific routines for the GPU implementation of SETTLE constraints algorithm.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+
+#include "settle_gpu_internal.h"
+
+#include "gromacs/utility/gmxassert.h"
+
+namespace gmx
+{
+
+void launchSettleGpuKernel(const int /* numSettles */,
+                           const DeviceBuffer<WaterMolecule> /* d_atomIds */,
+                           const SettleParameters /* settleParameters */,
+                           const DeviceBuffer<Float3> /* d_x */,
+                           DeviceBuffer<Float3> /* d_xp */,
+                           const bool /* updateVelocities */,
+                           DeviceBuffer<Float3> /* d_v */,
+                           const real /* invdt */,
+                           const bool /* computeVirial */,
+                           DeviceBuffer<float> /* virialScaled */,
+                           const PbcAiuc /* pbcAiuc */,
+                           const DeviceStream& /* deviceStream */)
+{
+    // SYCL_TODO
+    GMX_RELEASE_ASSERT(false, "SETTLE is not yet implemented in SYCL.");
+
+    return;
+}
+
+} // namespace gmx
diff --git a/src/gromacs/mdlib/tests/settletestrunners.cu b/src/gromacs/mdlib/tests/settletestrunners.cu
index 7aab750547..b35cf0be9b 100644
--- a/src/gromacs/mdlib/tests/settletestrunners.cu
+++ b/src/gromacs/mdlib/tests/settletestrunners.cu
@@ -54,7 +54,7 @@
 #include "gromacs/gpu_utils/devicebuffer.cuh"
 #include "gromacs/gpu_utils/gputraits.h"
 #include "gromacs/hardware/device_information.h"
-#include "gromacs/mdlib/settle_gpu.cuh"
+#include "gromacs/mdlib/settle_gpu.h"
 #include "gromacs/utility/unique_cptr.h"
 
 #include "testutils/test_device.h"
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
index ba3a1d12af..7ef37c0940 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
@@ -64,8 +64,6 @@
 #include "gromacs/gpu_utils/gputraits.cuh"
 #include "gromacs/gpu_utils/vectype_ops.cuh"
 #include "gromacs/mdlib/leapfrog_gpu.h"
-#include "gromacs/mdlib/lincs_gpu.h"
-#include "gromacs/mdlib/settle_gpu.cuh"
 #include "gromacs/mdlib/update_constrain_gpu.h"
 #include "gromacs/mdtypes/mdatom.h"
 #include "gromacs/timing/wallcycle.h"
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index dcca5dc2a9..bad3898602 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -50,7 +50,7 @@
 
 #include "gromacs/mdlib/leapfrog_gpu.h"
 #include "gromacs/mdlib/lincs_gpu.h"
-#include "gromacs/mdlib/settle_gpu.cuh"
+#include "gromacs/mdlib/settle_gpu.h"
 #include "gromacs/mdlib/update_constrain_gpu.h"
 #include "gromacs/mdtypes/inputrec.h"
 
-- 
2.22.0