From aac166c9ccb2c386c6017cd56664ad43f759b0b5 Mon Sep 17 00:00:00 2001
From: Artem Zhmurov <zhmurov@gmail.com>
Date: Tue, 11 May 2021 11:25:26 +0000
Subject: [PATCH] Make common Update-Constraints code backend-agnostic

---
 src/gromacs/mdlib/CMakeLists.txt              |  12 +-
 .../mdlib/update_constrain_gpu_impl.cpp       | 262 +++++++++++---
 .../mdlib/update_constrain_gpu_impl.cu        | 339 ------------------
 src/gromacs/mdlib/update_constrain_gpu_impl.h |  13 -
 .../mdlib/update_constrain_gpu_impl_stubs.cpp | 131 +++++++
 .../mdlib/update_constrain_gpu_internal.cu    | 105 ++++++
 .../mdlib/update_constrain_gpu_internal.h     |  84 +++++
 .../update_constrain_gpu_internal_sycl.cpp    |  61 ++++
 8 files changed, 600 insertions(+), 407 deletions(-)
 delete mode 100644 src/gromacs/mdlib/update_constrain_gpu_impl.cu
 create mode 100644 src/gromacs/mdlib/update_constrain_gpu_impl_stubs.cpp
 create mode 100644 src/gromacs/mdlib/update_constrain_gpu_internal.cu
 create mode 100644 src/gromacs/mdlib/update_constrain_gpu_internal.h
 create mode 100644 src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt
index 511a946809..2e645f0362 100644
--- a/src/gromacs/mdlib/CMakeLists.txt
+++ b/src/gromacs/mdlib/CMakeLists.txt
@@ -44,7 +44,9 @@ list(REMOVE_ITEM MDLIB_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu_internal_sycl.cpp)
+    ${CMAKE_CURRENT_SOURCE_DIR}/settle_gpu_internal_sycl.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/update_constrain_gpu_impl.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/update_constrain_gpu_internal_sycl.cpp)
 
 set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE)
 if(GMX_GPU_CUDA)
@@ -56,12 +58,14 @@ if(GMX_GPU_CUDA)
        lincs_gpu_internal.cu
        settle_gpu.cpp
        settle_gpu_internal.cu
-       update_constrain_gpu_impl.cu
+       update_constrain_gpu_impl.cpp
+       update_constrain_gpu_internal.cu
        )
     _gmx_add_files_to_property(CUDA_SOURCES
        gpuforcereduction_impl.cpp
        lincs_gpu.cpp
        settle_gpu.cpp
+       update_constrain_gpu_impl.cpp
        )
 endif()
 
@@ -74,6 +78,8 @@ if(GMX_GPU_SYCL)
         lincs_gpu_internal_sycl.cpp
         settle_gpu.cpp
         settle_gpu_internal_sycl.cpp
+        update_constrain_gpu_impl.cpp
+        update_constrain_gpu_internal_sycl.cpp
     )
 
     _gmx_add_files_to_property(SYCL_SOURCES
@@ -84,6 +90,8 @@ if(GMX_GPU_SYCL)
         lincs_gpu_internal_sycl.cpp
         settle_gpu.cpp
         settle_gpu_internal_sycl.cpp
+        update_constrain_gpu_impl.cpp
+        update_constrain_gpu_internal_sycl.cpp
     )
 endif()
 
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
index b0f3872860..df6420d378 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cpp
@@ -34,7 +34,12 @@
  */
 /*! \internal \file
  *
- * \brief Stub for update and constraints class CPU implementation.
+ * \brief Implements update and constraints class.
+ *
+ * The class combines Leap-Frog integrator with LINCS and SETTLE constraints.
+ *
+ * \todo The computational procedures in members should be integrated to improve
+ *       computational performance.
  *
  * \author Artem Zhmurov <zhmurov@gmail.com>
  *
@@ -42,90 +47,241 @@
  */
 #include "gmxpre.h"
 
-#include "config.h"
+#include "update_constrain_gpu_impl.h"
 
-#include "gromacs/mdlib/update_constrain_gpu.h"
-#include "gromacs/utility/gmxassert.h"
+#include <assert.h>
+#include <stdio.h>
+
+#include <cmath>
 
-#if !GMX_GPU_CUDA
+#include <algorithm>
+
+#include "gromacs/gpu_utils/device_context.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/devicebuffer.h"
+#if GMX_GPU_CUDA
+#    include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
+#elif GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
+#endif
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/mdlib/leapfrog_gpu.h"
+#include "gromacs/mdlib/update_constrain_gpu.h"
+#include "gromacs/mdlib/update_constrain_gpu_internal.h"
+#include "gromacs/mdtypes/mdatom.h"
+#include "gromacs/timing/wallcycle.h"
 
 namespace gmx
 {
 
-class UpdateConstrainGpu::Impl
+void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fReadyOnDevice,
+                                         const real                        dt,
+                                         const bool                        updateVelocities,
+                                         const bool                        computeVirial,
+                                         tensor                            virial,
+                                         const bool                        doTemperatureScaling,
+                                         gmx::ArrayRef<const t_grp_tcstat> tcstat,
+                                         const bool                        doParrinelloRahman,
+                                         const float                       dtPressureCouple,
+                                         const matrix                      prVelocityScalingMatrix)
 {
-};
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+
+    // Clearing virial matrix
+    // TODO There is no point in having separate virial matrix for constraints
+    clear_mat(virial);
 
-UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir   */,
-                                       const gmx_mtop_t& /* mtop */,
-                                       const int /* numTempScaleValues */,
-                                       const DeviceContext& /* deviceContext */,
-                                       const DeviceStream& /* deviceStream */,
-                                       GpuEventSynchronizer* /* xUpdatedOnDevice */,
-                                       gmx_wallcycle* /*wcycle*/) :
-    impl_(nullptr)
+    // Make sure that the forces are ready on device before proceeding with the update.
+    fReadyOnDevice->enqueueWaitEvent(deviceStream_);
+
+    // The integrate should save a copy of the current coordinates in d_xp_ and write updated
+    // once into d_x_. The d_xp_ is only needed by constraints.
+    integrator_->integrate(
+            d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat, doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
+    // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
+    // are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
+    // d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well.
+    lincsGpu_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
+    settleGpu_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
+
+    // scaledVirial -> virial (methods above returns scaled values)
+    float scaleFactor = 0.5F / (dt * dt);
+    for (int i = 0; i < DIM; i++)
+    {
+        for (int j = 0; j < DIM; j++)
+        {
+            virial[i][j] = scaleFactor * virial[i][j];
+        }
+    }
+
+    coordinatesReady_->markEvent(deviceStream_);
+
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
+
+    return;
+}
+
+void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
+{
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+
+    ScalingMatrix mu(scalingMatrix);
+
+    launchScaleCoordinatesKernel(numAtoms_, d_x_, mu, deviceStream_);
+
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
+}
+
+void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix)
+{
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+
+    ScalingMatrix mu(scalingMatrix);
+
+    launchScaleCoordinatesKernel(numAtoms_, d_v_, mu, deviceStream_);
+
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
+}
+
+UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
+                               const gmx_mtop_t&     mtop,
+                               const int             numTempScaleValues,
+                               const DeviceContext&  deviceContext,
+                               const DeviceStream&   deviceStream,
+                               GpuEventSynchronizer* xUpdatedOnDevice,
+                               gmx_wallcycle*        wcycle) :
+    deviceContext_(deviceContext),
+    deviceStream_(deviceStream),
+    coordinatesReady_(xUpdatedOnDevice),
+    wcycle_(wcycle)
+{
+    GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
+
+    integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, deviceStream_, numTempScaleValues);
+    lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, deviceStream_);
+    settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, deviceStream_);
+}
+
+UpdateConstrainGpu::Impl::~Impl() {}
+
+void UpdateConstrainGpu::Impl::set(DeviceBuffer<Float3>          d_x,
+                                   DeviceBuffer<Float3>          d_v,
+                                   const DeviceBuffer<Float3>    d_f,
+                                   const InteractionDefinitions& idef,
+                                   const t_mdatoms&              md)
+{
+    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
+    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+
+    GMX_ASSERT(d_x, "Coordinates device buffer should not be null.");
+    GMX_ASSERT(d_v, "Velocities device buffer should not be null.");
+    GMX_ASSERT(d_f, "Forces device buffer should not be null.");
+
+    d_x_ = d_x;
+    d_v_ = d_v;
+    d_f_ = d_f;
+
+    numAtoms_ = md.nr;
+
+    reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, deviceContext_);
+
+    reallocateDeviceBuffer(
+            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
+
+    // Integrator should also update something, but it does not even have a method yet
+    integrator_->set(numAtoms_, md.invmass, md.cTC);
+    lincsGpu_->set(idef, numAtoms_, md.invmass);
+    settleGpu_->set(idef);
+
+    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
+    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
+}
+
+void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box)
+{
+    // TODO wallcycle
+    setPbcAiuc(numPbcDimensions(pbcType), box, &pbcAiuc_);
+}
+
+GpuEventSynchronizer* UpdateConstrainGpu::Impl::getCoordinatesReadySync()
+{
+    return coordinatesReady_;
+}
+
+UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec&     ir,
+                                       const gmx_mtop_t&     mtop,
+                                       const int             numTempScaleValues,
+                                       const DeviceContext&  deviceContext,
+                                       const DeviceStream&   deviceStream,
+                                       GpuEventSynchronizer* xUpdatedOnDevice,
+                                       gmx_wallcycle*        wcycle) :
+    impl_(new Impl(ir, mtop, numTempScaleValues, deviceContext, deviceStream, xUpdatedOnDevice, wcycle))
 {
-    GMX_ASSERT(!impl_,
-               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
 }
 
 UpdateConstrainGpu::~UpdateConstrainGpu() = default;
 
-void UpdateConstrainGpu::integrate(GpuEventSynchronizer* /* fReadyOnDevice */,
-                                   const real /* dt */,
-                                   const bool /* updateVelocities */,
-                                   const bool /* computeVirial */,
-                                   tensor /* virialScaled */,
-                                   const bool /* doTemperatureScaling */,
-                                   gmx::ArrayRef<const t_grp_tcstat> /* tcstat */,
-                                   const bool /* doParrinelloRahman */,
-                                   const float /* dtPressureCouple */,
-                                   const matrix /* prVelocityScalingMatrix*/)
-{
-    GMX_ASSERT(!impl_,
-               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+void UpdateConstrainGpu::integrate(GpuEventSynchronizer*             fReadyOnDevice,
+                                   const real                        dt,
+                                   const bool                        updateVelocities,
+                                   const bool                        computeVirial,
+                                   tensor                            virialScaled,
+                                   const bool                        doTemperatureScaling,
+                                   gmx::ArrayRef<const t_grp_tcstat> tcstat,
+                                   const bool                        doParrinelloRahman,
+                                   const float                       dtPressureCouple,
+                                   const matrix                      prVelocityScalingMatrix)
+{
+    impl_->integrate(fReadyOnDevice,
+                     dt,
+                     updateVelocities,
+                     computeVirial,
+                     virialScaled,
+                     doTemperatureScaling,
+                     tcstat,
+                     doParrinelloRahman,
+                     dtPressureCouple,
+                     prVelocityScalingMatrix);
 }
 
-void UpdateConstrainGpu::scaleCoordinates(const matrix /* scalingMatrix */)
+void UpdateConstrainGpu::scaleCoordinates(const matrix scalingMatrix)
 {
-    GMX_ASSERT(!impl_,
-               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+    impl_->scaleCoordinates(scalingMatrix);
 }
 
-void UpdateConstrainGpu::scaleVelocities(const matrix /* scalingMatrix */)
+void UpdateConstrainGpu::scaleVelocities(const matrix scalingMatrix)
 {
-    GMX_ASSERT(!impl_,
-               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+    impl_->scaleVelocities(scalingMatrix);
 }
 
-void UpdateConstrainGpu::set(DeviceBuffer<RVec> /* d_x */,
-                             DeviceBuffer<RVec> /* d_v */,
-                             const DeviceBuffer<RVec> /* d_f */,
-                             const InteractionDefinitions& /* idef */,
-                             const t_mdatoms& /* md */)
+void UpdateConstrainGpu::set(DeviceBuffer<Float3>          d_x,
+                             DeviceBuffer<Float3>          d_v,
+                             const DeviceBuffer<Float3>    d_f,
+                             const InteractionDefinitions& idef,
+                             const t_mdatoms&              md)
 {
-    GMX_ASSERT(!impl_,
-               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+    impl_->set(d_x, d_v, d_f, idef, md);
 }
 
-void UpdateConstrainGpu::setPbc(const PbcType /* pbcType */, const matrix /* box */)
+void UpdateConstrainGpu::setPbc(const PbcType pbcType, const matrix box)
 {
-    GMX_ASSERT(!impl_,
-               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+    impl_->setPbc(pbcType, box);
 }
 
 GpuEventSynchronizer* UpdateConstrainGpu::getCoordinatesReadySync()
 {
-    GMX_ASSERT(!impl_,
-               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
-    return nullptr;
+    return impl_->getCoordinatesReadySync();
 }
 
-bool UpdateConstrainGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& /* mtop */)
+bool UpdateConstrainGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
 {
-    return false;
+    return LincsGpu::isNumCoupledConstraintsSupported(mtop);
 }
 
 } // namespace gmx
-
-#endif /* !GMX_GPU_CUDA */
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu
deleted file mode 100644
index 7ef37c0940..0000000000
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu
+++ /dev/null
@@ -1,339 +0,0 @@
-/*
- * This file is part of the GROMACS molecular simulation package.
- *
- * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
- * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
- * and including many others, as listed in the AUTHORS file in the
- * top-level source directory and at http://www.gromacs.org.
- *
- * GROMACS is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public License
- * as published by the Free Software Foundation; either version 2.1
- * of the License, or (at your option) any later version.
- *
- * GROMACS is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with GROMACS; if not, see
- * http://www.gnu.org/licenses, or write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
- *
- * If you want to redistribute modifications to GROMACS, please
- * consider that scientific software is very special. Version
- * control is crucial - bugs must be traceable. We will be happy to
- * consider code for inclusion in the official distribution, but
- * derived work must not be called official GROMACS. Details are found
- * in the README & COPYING files - if they are missing, get the
- * official version at http://www.gromacs.org.
- *
- * To help us fund GROMACS development, we humbly ask that you cite
- * the research papers on the package. Check out http://www.gromacs.org.
- */
-/*! \internal \file
- *
- * \brief Implements update and constraints class using CUDA.
- *
- * The class combines Leap-Frog integrator with LINCS and SETTLE constraints.
- *
- * \todo The computational procedures in members should be integrated to improve
- *       computational performance.
- *
- * \author Artem Zhmurov <zhmurov@gmail.com>
- *
- * \ingroup module_mdlib
- */
-#include "gmxpre.h"
-
-#include "update_constrain_gpu_impl.h"
-
-#include <assert.h>
-#include <stdio.h>
-
-#include <cmath>
-
-#include <algorithm>
-
-#include "gromacs/gpu_utils/cudautils.cuh"
-#include "gromacs/gpu_utils/device_context.h"
-#include "gromacs/gpu_utils/device_stream.h"
-#include "gromacs/gpu_utils/devicebuffer.h"
-#include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
-#include "gromacs/gpu_utils/gputraits.cuh"
-#include "gromacs/gpu_utils/vectype_ops.cuh"
-#include "gromacs/mdlib/leapfrog_gpu.h"
-#include "gromacs/mdlib/update_constrain_gpu.h"
-#include "gromacs/mdtypes/mdatom.h"
-#include "gromacs/timing/wallcycle.h"
-
-namespace gmx
-{
-/*!\brief Number of CUDA threads in a block
- *
- * \todo Check if using smaller block size will lead to better performance.
- */
-constexpr static int c_threadsPerBlock = 256;
-//! Maximum number of threads in a block (for __launch_bounds__)
-constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
-
-__launch_bounds__(c_maxThreadsPerBlock) __global__
-        static void scaleCoordinates_kernel(const int numAtoms,
-                                            float3* __restrict__ gm_x,
-                                            const ScalingMatrix scalingMatrix)
-{
-    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
-    if (threadIndex < numAtoms)
-    {
-        float3 x = gm_x[threadIndex];
-
-        x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z;
-        x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z;
-        x.z = scalingMatrix.zz * x.z;
-
-        gm_x[threadIndex] = x;
-    }
-}
-
-void UpdateConstrainGpu::Impl::integrate(GpuEventSynchronizer*             fReadyOnDevice,
-                                         const real                        dt,
-                                         const bool                        updateVelocities,
-                                         const bool                        computeVirial,
-                                         tensor                            virial,
-                                         const bool                        doTemperatureScaling,
-                                         gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                                         const bool                        doParrinelloRahman,
-                                         const float                       dtPressureCouple,
-                                         const matrix                      prVelocityScalingMatrix)
-{
-    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
-    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
-
-    // Clearing virial matrix
-    // TODO There is no point in having separate virial matrix for constraints
-    clear_mat(virial);
-
-    // Make sure that the forces are ready on device before proceeding with the update.
-    fReadyOnDevice->enqueueWaitEvent(deviceStream_);
-
-    // The integrate should save a copy of the current coordinates in d_xp_ and write updated
-    // once into d_x_. The d_xp_ is only needed by constraints.
-    integrator_->integrate(
-            d_x_, d_xp_, d_v_, d_f_, dt, doTemperatureScaling, tcstat, doParrinelloRahman, dtPressureCouple, prVelocityScalingMatrix);
-    // Constraints need both coordinates before (d_x_) and after (d_xp_) update. However, after constraints
-    // are applied, the d_x_ can be discarded. So we intentionally swap the d_x_ and d_xp_ here to avoid the
-    // d_xp_ -> d_x_ copy after constraints. Note that the integrate saves them in the wrong order as well.
-    lincsGpu_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
-    settleGpu_->apply(d_xp_, d_x_, updateVelocities, d_v_, 1.0 / dt, computeVirial, virial, pbcAiuc_);
-
-    // scaledVirial -> virial (methods above returns scaled values)
-    float scaleFactor = 0.5f / (dt * dt);
-    for (int i = 0; i < DIM; i++)
-    {
-        for (int j = 0; j < DIM; j++)
-        {
-            virial[i][j] = scaleFactor * virial[i][j];
-        }
-    }
-
-    coordinatesReady_->markEvent(deviceStream_);
-
-    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
-    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-
-    return;
-}
-
-void UpdateConstrainGpu::Impl::scaleCoordinates(const matrix scalingMatrix)
-{
-    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
-    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
-
-    ScalingMatrix mu(scalingMatrix);
-
-    const auto kernelArgs = prepareGpuKernelArguments(
-            scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_x_, &mu);
-    launchGpuKernel(scaleCoordinates_kernel,
-                    coordinateScalingKernelLaunchConfig_,
-                    deviceStream_,
-                    nullptr,
-                    "scaleCoordinates_kernel",
-                    kernelArgs);
-    // TODO: Although this only happens on the pressure coupling steps, this synchronization
-    //       can affect the performance if nstpcouple is small.
-    deviceStream_.synchronize();
-
-    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
-    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-}
-
-void UpdateConstrainGpu::Impl::scaleVelocities(const matrix scalingMatrix)
-{
-    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
-    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
-
-    ScalingMatrix mu(scalingMatrix);
-
-    const auto kernelArgs = prepareGpuKernelArguments(
-            scaleCoordinates_kernel, coordinateScalingKernelLaunchConfig_, &numAtoms_, &d_v_, &mu);
-    launchGpuKernel(scaleCoordinates_kernel,
-                    coordinateScalingKernelLaunchConfig_,
-                    deviceStream_,
-                    nullptr,
-                    "scaleCoordinates_kernel",
-                    kernelArgs);
-    // TODO: Although this only happens on the pressure coupling steps, this synchronization
-    //       can affect the performance if nstpcouple is small.
-    deviceStream_.synchronize();
-
-    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
-    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-}
-
-UpdateConstrainGpu::Impl::Impl(const t_inputrec&     ir,
-                               const gmx_mtop_t&     mtop,
-                               const int             numTempScaleValues,
-                               const DeviceContext&  deviceContext,
-                               const DeviceStream&   deviceStream,
-                               GpuEventSynchronizer* xUpdatedOnDevice,
-                               gmx_wallcycle*        wcycle) :
-    deviceContext_(deviceContext),
-    deviceStream_(deviceStream),
-    coordinatesReady_(xUpdatedOnDevice),
-    wcycle_(wcycle)
-{
-    GMX_ASSERT(xUpdatedOnDevice != nullptr, "The event synchronizer can not be nullptr.");
-
-
-    integrator_ = std::make_unique<LeapFrogGpu>(deviceContext_, deviceStream_, numTempScaleValues);
-    lincsGpu_ = std::make_unique<LincsGpu>(ir.nLincsIter, ir.nProjOrder, deviceContext_, deviceStream_);
-    settleGpu_ = std::make_unique<SettleGpu>(mtop, deviceContext_, deviceStream_);
-
-    coordinateScalingKernelLaunchConfig_.blockSize[0]     = c_threadsPerBlock;
-    coordinateScalingKernelLaunchConfig_.blockSize[1]     = 1;
-    coordinateScalingKernelLaunchConfig_.blockSize[2]     = 1;
-    coordinateScalingKernelLaunchConfig_.sharedMemorySize = 0;
-}
-
-UpdateConstrainGpu::Impl::~Impl() {}
-
-void UpdateConstrainGpu::Impl::set(DeviceBuffer<Float3>          d_x,
-                                   DeviceBuffer<Float3>          d_v,
-                                   const DeviceBuffer<Float3>    d_f,
-                                   const InteractionDefinitions& idef,
-                                   const t_mdatoms&              md)
-{
-    // TODO wallcycle
-    wallcycle_start_nocount(wcycle_, WallCycleCounter::LaunchGpu);
-    wallcycle_sub_start(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
-
-    GMX_ASSERT(d_x != nullptr, "Coordinates device buffer should not be null.");
-    GMX_ASSERT(d_v != nullptr, "Velocities device buffer should not be null.");
-    GMX_ASSERT(d_f != nullptr, "Forces device buffer should not be null.");
-
-    d_x_ = d_x;
-    d_v_ = d_v;
-    d_f_ = d_f;
-
-    numAtoms_ = md.nr;
-
-    reallocateDeviceBuffer(&d_xp_, numAtoms_, &numXp_, &numXpAlloc_, deviceContext_);
-
-    reallocateDeviceBuffer(
-            &d_inverseMasses_, numAtoms_, &numInverseMasses_, &numInverseMassesAlloc_, deviceContext_);
-
-    // Integrator should also update something, but it does not even have a method yet
-    integrator_->set(numAtoms_, md.invmass, md.cTC);
-    lincsGpu_->set(idef, numAtoms_, md.invmass);
-    settleGpu_->set(idef);
-
-    coordinateScalingKernelLaunchConfig_.gridSize[0] =
-            (numAtoms_ + c_threadsPerBlock - 1) / c_threadsPerBlock;
-
-    wallcycle_sub_stop(wcycle_, WallCycleSubCounter::LaunchGpuUpdateConstrain);
-    wallcycle_stop(wcycle_, WallCycleCounter::LaunchGpu);
-}
-
-void UpdateConstrainGpu::Impl::setPbc(const PbcType pbcType, const matrix box)
-{
-    // TODO wallcycle
-    setPbcAiuc(numPbcDimensions(pbcType), box, &pbcAiuc_);
-}
-
-GpuEventSynchronizer* UpdateConstrainGpu::Impl::getCoordinatesReadySync()
-{
-    return coordinatesReady_;
-}
-
-UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec&     ir,
-                                       const gmx_mtop_t&     mtop,
-                                       const int             numTempScaleValues,
-                                       const DeviceContext&  deviceContext,
-                                       const DeviceStream&   deviceStream,
-                                       GpuEventSynchronizer* xUpdatedOnDevice,
-                                       gmx_wallcycle*        wcycle) :
-    impl_(new Impl(ir, mtop, numTempScaleValues, deviceContext, deviceStream, xUpdatedOnDevice, wcycle))
-{
-}
-
-UpdateConstrainGpu::~UpdateConstrainGpu() = default;
-
-void UpdateConstrainGpu::integrate(GpuEventSynchronizer*             fReadyOnDevice,
-                                   const real                        dt,
-                                   const bool                        updateVelocities,
-                                   const bool                        computeVirial,
-                                   tensor                            virialScaled,
-                                   const bool                        doTemperatureScaling,
-                                   gmx::ArrayRef<const t_grp_tcstat> tcstat,
-                                   const bool                        doParrinelloRahman,
-                                   const float                       dtPressureCouple,
-                                   const matrix                      prVelocityScalingMatrix)
-{
-    impl_->integrate(fReadyOnDevice,
-                     dt,
-                     updateVelocities,
-                     computeVirial,
-                     virialScaled,
-                     doTemperatureScaling,
-                     tcstat,
-                     doParrinelloRahman,
-                     dtPressureCouple,
-                     prVelocityScalingMatrix);
-}
-
-void UpdateConstrainGpu::scaleCoordinates(const matrix scalingMatrix)
-{
-    impl_->scaleCoordinates(scalingMatrix);
-}
-
-void UpdateConstrainGpu::scaleVelocities(const matrix scalingMatrix)
-{
-    impl_->scaleVelocities(scalingMatrix);
-}
-
-void UpdateConstrainGpu::set(DeviceBuffer<Float3>          d_x,
-                             DeviceBuffer<Float3>          d_v,
-                             const DeviceBuffer<Float3>    d_f,
-                             const InteractionDefinitions& idef,
-                             const t_mdatoms&              md)
-{
-    impl_->set(d_x, d_v, d_f, idef, md);
-}
-
-void UpdateConstrainGpu::setPbc(const PbcType pbcType, const matrix box)
-{
-    impl_->setPbc(pbcType, box);
-}
-
-GpuEventSynchronizer* UpdateConstrainGpu::getCoordinatesReadySync()
-{
-    return impl_->getCoordinatesReadySync();
-}
-
-bool UpdateConstrainGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop)
-{
-    return LincsGpu::isNumCoupledConstraintsSupported(mtop);
-}
-
-} // namespace gmx
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h
index 9b260d1a99..150c944aae 100644
--- a/src/gromacs/mdlib/update_constrain_gpu_impl.h
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h
@@ -228,19 +228,6 @@ private:
     gmx_wallcycle* wcycle_ = nullptr;
 };
 
-/*! \brief Scaling matrix struct.
- *
- * \todo Should be generalized.
- */
-struct ScalingMatrix
-{
-    ScalingMatrix(const matrix m) :
-        xx(m[XX][XX]), yy(m[YY][YY]), zz(m[ZZ][ZZ]), yx(m[YY][XX]), zx(m[ZZ][XX]), zy(m[ZZ][YY])
-    {
-    }
-    float xx, yy, zz, yx, zx, zy;
-};
-
 } // namespace gmx
 
 #endif // GMX_MDLIB_UPDATE_CONSTRAIN_GPU_IMPL_H
diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl_stubs.cpp b/src/gromacs/mdlib/update_constrain_gpu_impl_stubs.cpp
new file mode 100644
index 0000000000..38b55c7931
--- /dev/null
+++ b/src/gromacs/mdlib/update_constrain_gpu_impl_stubs.cpp
@@ -0,0 +1,131 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Stub for update and constraints class CPU implementation.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "config.h"
+
+#include "gromacs/mdlib/update_constrain_gpu.h"
+#include "gromacs/utility/gmxassert.h"
+
+#if !GMX_GPU_CUDA && !GMX_GPU_SYCL
+
+namespace gmx
+{
+
+class UpdateConstrainGpu::Impl
+{
+};
+
+UpdateConstrainGpu::UpdateConstrainGpu(const t_inputrec& /* ir   */,
+                                       const gmx_mtop_t& /* mtop */,
+                                       const int /* numTempScaleValues */,
+                                       const DeviceContext& /* deviceContext */,
+                                       const DeviceStream& /* deviceStream */,
+                                       GpuEventSynchronizer* /* xUpdatedOnDevice */,
+                                       gmx_wallcycle* /*wcycle*/) :
+    impl_(nullptr)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
+UpdateConstrainGpu::~UpdateConstrainGpu() = default;
+
+void UpdateConstrainGpu::integrate(GpuEventSynchronizer* /* fReadyOnDevice */,
+                                   const real /* dt */,
+                                   const bool /* updateVelocities */,
+                                   const bool /* computeVirial */,
+                                   tensor /* virialScaled */,
+                                   const bool /* doTemperatureScaling */,
+                                   gmx::ArrayRef<const t_grp_tcstat> /* tcstat */,
+                                   const bool /* doParrinelloRahman */,
+                                   const float /* dtPressureCouple */,
+                                   const matrix /* prVelocityScalingMatrix*/)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
+void UpdateConstrainGpu::scaleCoordinates(const matrix /* scalingMatrix */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
+void UpdateConstrainGpu::scaleVelocities(const matrix /* scalingMatrix */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
+void UpdateConstrainGpu::set(DeviceBuffer<RVec> /* d_x */,
+                             DeviceBuffer<RVec> /* d_v */,
+                             const DeviceBuffer<RVec> /* d_f */,
+                             const InteractionDefinitions& /* idef */,
+                             const t_mdatoms& /* md */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
+void UpdateConstrainGpu::setPbc(const PbcType /* pbcType */, const matrix /* box */)
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+}
+
+GpuEventSynchronizer* UpdateConstrainGpu::getCoordinatesReadySync()
+{
+    GMX_ASSERT(!impl_,
+               "A CPU stub for UpdateConstrain was called instead of the correct implementation.");
+    return nullptr;
+}
+
+bool UpdateConstrainGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& /* mtop */)
+{
+    return false;
+}
+
+} // namespace gmx
+
+#endif /* !GMX_GPU_CUDA && !GMX_GPU_SYCL */
diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal.cu b/src/gromacs/mdlib/update_constrain_gpu_internal.cu
new file mode 100644
index 0000000000..5787bc476e
--- /dev/null
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal.cu
@@ -0,0 +1,105 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements backend-specific functions of the update-constraints in CUDA.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "update_constrain_gpu_internal.h"
+
+#include "gromacs/gpu_utils/cudautils.cuh"
+#include "gromacs/gpu_utils/typecasts.cuh"
+#include "gromacs/gpu_utils/vectype_ops.cuh"
+
+namespace gmx
+{
+
+/*!\brief Number of CUDA threads in a block
+ *
+ * \todo Check if using smaller block size will lead to better performance.
+ */
+constexpr static int c_threadsPerBlock = 256;
+//! Maximum number of threads in a block (for __launch_bounds__)
+constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock;
+
+__launch_bounds__(c_maxThreadsPerBlock) __global__
+        static void scaleCoordinates_kernel(const int numAtoms,
+                                            float3* __restrict__ gm_x,
+                                            const ScalingMatrix scalingMatrix)
+{
+    int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
+    if (threadIndex < numAtoms)
+    {
+        float3 x = gm_x[threadIndex];
+
+        x.x = scalingMatrix.xx * x.x + scalingMatrix.yx * x.y + scalingMatrix.zx * x.z;
+        x.y = scalingMatrix.yy * x.y + scalingMatrix.zy * x.z;
+        x.z = scalingMatrix.zz * x.z;
+
+        gm_x[threadIndex] = x;
+    }
+}
+
+void launchScaleCoordinatesKernel(const int            numAtoms,
+                                  DeviceBuffer<Float3> d_coordinates,
+                                  const ScalingMatrix  mu,
+                                  const DeviceStream&  deviceStream)
+{
+    KernelLaunchConfig kernelLaunchConfig;
+
+    kernelLaunchConfig.blockSize[0]     = c_threadsPerBlock;
+    kernelLaunchConfig.blockSize[1]     = 1;
+    kernelLaunchConfig.blockSize[2]     = 1;
+    kernelLaunchConfig.sharedMemorySize = 0;
+
+    kernelLaunchConfig.gridSize[0] = (numAtoms + c_threadsPerBlock - 1) / c_threadsPerBlock;
+
+    const auto kernelArgs = prepareGpuKernelArguments(
+            scaleCoordinates_kernel, kernelLaunchConfig, &numAtoms, asFloat3Pointer(&d_coordinates), &mu);
+    launchGpuKernel(scaleCoordinates_kernel,
+                    kernelLaunchConfig,
+                    deviceStream,
+                    nullptr,
+                    "scaleCoordinates_kernel",
+                    kernelArgs);
+    // TODO: Although this only happens on the pressure coupling steps, this synchronization
+    //       can affect the performance if nstpcouple is small. See Issue #4018
+    deviceStream.synchronize();
+}
+
+} // namespace gmx
diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal.h b/src/gromacs/mdlib/update_constrain_gpu_internal.h
new file mode 100644
index 0000000000..804636f8f0
--- /dev/null
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Declares GPU implementations of backend-specific update-constraints functions.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#ifndef GMX_MDLIB_UPDATE_CONSTRAIN_GPU_INTERNAL_H
+#define GMX_MDLIB_UPDATE_CONSTRAIN_GPU_INTERNAL_H
+
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/devicebuffer_datatype.h"
+#include "gromacs/gpu_utils/device_stream.h"
+#include "gromacs/gpu_utils/gputraits.h"
+
+class GpuEventSynchronizer;
+
+namespace gmx
+{
+
+/*! \brief Scaling matrix struct.
+ *
+ * \todo Should be generalized.
+ */
+struct ScalingMatrix
+{
+    ScalingMatrix(const matrix m) :
+        xx(m[XX][XX]), yy(m[YY][YY]), zz(m[ZZ][ZZ]), yx(m[YY][XX]), zx(m[ZZ][XX]), zy(m[ZZ][YY])
+    {
+    }
+    float xx, yy, zz, yx, zx, zy;
+};
+
+/*! \brief Launches positions of velocities scaling kernel.
+ *
+ * \param[in] numAtoms       Number of atoms in the system.
+ * \param[in] d_coordinates  Device buffer with position or velocities to be scaled.
+ * \param[in] mu             Scaling matrix.
+ * \param[in] deviceStream   Stream to launch kernel in.
+ */
+void launchScaleCoordinatesKernel(int                  numAtoms,
+                                  DeviceBuffer<Float3> d_coordinates,
+                                  const ScalingMatrix  mu,
+                                  const DeviceStream&  deviceStream);
+
+} // namespace gmx
+
+#endif // GMX_MDLIB_UPDATE_CONSTRAIN_GPU_INTERNAL_H
diff --git a/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
new file mode 100644
index 0000000000..72723bd99e
--- /dev/null
+++ b/src/gromacs/mdlib/update_constrain_gpu_internal_sycl.cpp
@@ -0,0 +1,61 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements backend-specific functions of the update-constraints in SYCL.
+ *
+ * \author Artem Zhmurov <zhmurov@gmail.com>
+ *
+ * \ingroup module_mdlib
+ */
+#include "gmxpre.h"
+
+#include "update_constrain_gpu_internal.h"
+
+#include "gromacs/utility/gmxassert.h"
+
+namespace gmx
+{
+
+void launchScaleCoordinatesKernel(const int /* numAtoms */,
+                                  DeviceBuffer<Float3> /* d_coordinates */,
+                                  const ScalingMatrix /* mu */,
+                                  const DeviceStream& /* deviceStream */)
+{
+    // SYCL_TODO
+    GMX_RELEASE_ASSERT(false, "Coordinates scaling is not yet implemented in SYCL.");
+}
+
+} // namespace gmx
-- 
2.22.0