From 68e1dd0108740bae8f6cec521feb9ce9a26c4dbd Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Thu, 22 Apr 2021 16:21:28 +0000 Subject: [PATCH] Make LINCS setup code platform-agnostic --- src/gromacs/mdlib/CMakeLists.txt | 17 +- .../mdlib/{lincs_gpu.cu => lincs_gpu.cpp} | 433 +--------------- .../mdlib/{lincs_gpu.cuh => lincs_gpu.h} | 27 +- src/gromacs/mdlib/lincs_gpu_internal.cu | 466 ++++++++++++++++++ src/gromacs/mdlib/lincs_gpu_internal.h | 71 +++ src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp | 68 +++ src/gromacs/mdlib/tests/constrtestrunners.cu | 2 +- .../mdlib/update_constrain_gpu_impl.cu | 2 +- src/gromacs/mdlib/update_constrain_gpu_impl.h | 2 +- 9 files changed, 658 insertions(+), 430 deletions(-) rename src/gromacs/mdlib/{lincs_gpu.cu => lincs_gpu.cpp} (60%) rename src/gromacs/mdlib/{lincs_gpu.cuh => lincs_gpu.h} (93%) create mode 100644 src/gromacs/mdlib/lincs_gpu_internal.cu create mode 100644 src/gromacs/mdlib/lincs_gpu_internal.h create mode 100644 src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp diff --git a/src/gromacs/mdlib/CMakeLists.txt b/src/gromacs/mdlib/CMakeLists.txt index 96b6be720f..623fcb31df 100644 --- a/src/gromacs/mdlib/CMakeLists.txt +++ b/src/gromacs/mdlib/CMakeLists.txt @@ -36,26 +36,37 @@ add_library(mdlib INTERFACE) file(GLOB MDLIB_SOURCES *.cpp) -# To avoid listing all the necessary files manually, we will remove SYCL-specfific one here: -list(REMOVE_ITEM MDLIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp) +# To avoid listing all the necessary files manually, we will remove SYCL-specfific files here: +list(REMOVE_ITEM MDLIB_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/leapfrog_gpu_sycl.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/lincs_gpu_internal_sycl.cpp) set(MDLIB_SOURCES ${MDLIB_SOURCES} PARENT_SCOPE) if(GMX_GPU_CUDA) gmx_add_libgromacs_sources( leapfrog_gpu.cu - lincs_gpu.cu + lincs_gpu.cpp + lincs_gpu_internal.cu settle_gpu.cu update_constrain_gpu_impl.cu gpuforcereduction_impl.cu ) + _gmx_add_files_to_property(CUDA_SOURCES + lincs_gpu.cpp + ) endif() if(GMX_GPU_SYCL) gmx_add_libgromacs_sources( leapfrog_gpu_sycl.cpp + lincs_gpu.cpp + lincs_gpu_internal_sycl.cpp ) _gmx_add_files_to_property(SYCL_SOURCES leapfrog_gpu_sycl.cpp + lincs_gpu.cpp + lincs_gpu_internal_sycl.cpp ) endif() diff --git a/src/gromacs/mdlib/lincs_gpu.cu b/src/gromacs/mdlib/lincs_gpu.cpp similarity index 60% rename from src/gromacs/mdlib/lincs_gpu.cu rename to src/gromacs/mdlib/lincs_gpu.cpp index 466c250f4c..96c0b8fc32 100644 --- a/src/gromacs/mdlib/lincs_gpu.cu +++ b/src/gromacs/mdlib/lincs_gpu.cpp @@ -34,11 +34,9 @@ */ /*! \internal \file * - * \brief Implements LINCS using CUDA + * \brief Implementations of LINCS GPU class * - * This file contains implementation of LINCS constraints algorithm - * using CUDA, including class initialization, data-structures management - * and GPU kernel. + * This file contains back-end agnostic implementation of LINCS GPU class. 
* * \author Artem Zhmurov * \author Alan Gray @@ -47,7 +45,7 @@ */ #include "gmxpre.h" -#include "lincs_gpu.cuh" +#include "lincs_gpu.h" #include #include @@ -56,378 +54,19 @@ #include -#include "gromacs/gpu_utils/cuda_arch_utils.cuh" -#include "gromacs/gpu_utils/cudautils.cuh" -#include "gromacs/gpu_utils/devicebuffer.cuh" +#include "gromacs/gpu_utils/devicebuffer.h" #include "gromacs/gpu_utils/gputraits.h" -#include "gromacs/gpu_utils/typecasts.cuh" -#include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/math/functions.h" #include "gromacs/math/vec.h" #include "gromacs/mdlib/constr.h" +#include "gromacs/mdlib/lincs_gpu_internal.h" #include "gromacs/pbcutil/pbc.h" -#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh" -#include "gromacs/topology/forcefieldparameters.h" #include "gromacs/topology/ifunc.h" #include "gromacs/topology/topology.h" namespace gmx { -//! Number of CUDA threads in a block -constexpr static int c_threadsPerBlock = 256; -//! Maximum number of threads in a block (for __launch_bounds__) -constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock; - -/*! \brief Main kernel for LINCS constraints. - * - * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm. - * - * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are - * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved - * by moving active threads to the next block, if the correspondent group of coupled constraints is to big - * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e. - * threads that are not required to do actual work. Since constraints from different blocks are not coupled, - * there is no need to synchronize across the device. However, extensive communication in a thread block - * are still needed. - * - * \todo Reduce synchronization overhead. Some ideas are: - * 1. Consider going to warp-level synchronization for the coupled constraints. - * 2. Move more data to local/shared memory and try to get rid of atomic operations (at least on - * the device level). - * 3. Use analytical solution for matrix A inversion. - * 4. Introduce mapping of thread id to both single constraint and single atom, thus designating - * Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints. - * See Issue #2885 for details (https://gitlab.com/gromacs/gromacs/-/issues/2885) - * \todo The use of __restrict__ for gm_xp and gm_v causes failure, probably because of the atomic - operations. Investigate this issue further. - * - * \param[in,out] kernelParams All parameters and pointers for the kernel condensed in single struct. - * \param[in] invdt Inverse timestep (needed to update velocities). 
- */ -template -__launch_bounds__(c_maxThreadsPerBlock) __global__ - void lincs_kernel(LincsGpuKernelParameters kernelParams, - const float3* __restrict__ gm_x, - float3* gm_xp, - float3* gm_v, - const float invdt) -{ - const PbcAiuc pbcAiuc = kernelParams.pbcAiuc; - const int numConstraintsThreads = kernelParams.numConstraintsThreads; - const int numIterations = kernelParams.numIterations; - const int expansionOrder = kernelParams.expansionOrder; - const int2* __restrict__ gm_constraints = kernelParams.d_constraints; - const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths; - const int* __restrict__ gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts; - const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices; - const float* __restrict__ gm_massFactors = kernelParams.d_massFactors; - float* __restrict__ gm_matrixA = kernelParams.d_matrixA; - const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses; - float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled; - - int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; - - // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize). - // This is to ensure proper synchronizations and reduction. All array are padded to the required size. - assert(threadIndex < numConstraintsThreads); - - // Vectors connecting constrained atoms before algorithm was applied. - // Needed to construct constrain matrix A - extern __shared__ float3 sm_r[]; - - int2 pair = gm_constraints[threadIndex]; - int i = pair.x; - int j = pair.y; - - // Mass-scaled Lagrange multiplier - float lagrangeScaled = 0.0f; - - float targetLength; - float inverseMassi; - float inverseMassj; - float sqrtReducedMass; - - float3 xi; - float3 xj; - float3 rc; - - // i == -1 indicates dummy constraint at the end of the thread block. 
- bool isDummyThread = (i == -1); - - // Everything computed for these dummies will be equal to zero - if (isDummyThread) - { - targetLength = 0.0f; - inverseMassi = 0.0f; - inverseMassj = 0.0f; - sqrtReducedMass = 0.0f; - - xi = make_float3(0.0f, 0.0f, 0.0f); - xj = make_float3(0.0f, 0.0f, 0.0f); - rc = make_float3(0.0f, 0.0f, 0.0f); - } - else - { - // Collecting data - targetLength = gm_constraintsTargetLengths[threadIndex]; - inverseMassi = gm_inverseMasses[i]; - inverseMassj = gm_inverseMasses[j]; - sqrtReducedMass = rsqrt(inverseMassi + inverseMassj); - - xi = gm_x[i]; - xj = gm_x[j]; - - float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); - - float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z); - rc = rlen * dx; - } - - sm_r[threadIdx.x] = rc; - // Make sure that all r's are saved into shared memory - // before they are accessed in the loop below - __syncthreads(); - - /* - * Constructing LINCS matrix (A) - */ - - // Only non-zero values are saved (for coupled constraints) - int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex]; - for (int n = 0; n < coupledConstraintsCount; n++) - { - int index = n * numConstraintsThreads + threadIndex; - int c1 = gm_coupledConstraintsIdxes[index]; - - float3 rc1 = sm_r[c1 - blockIdx.x * blockDim.x]; - gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z); - } - - // Skipping in dummy threads - if (!isDummyThread) - { - xi = gm_xp[i]; - xj = gm_xp[j]; - } - - float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); - - float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength); - - /* - * Inverse matrix using a set of expansionOrder matrix multiplications - */ - - // This will use the same memory space as sm_r, which is no longer needed. - extern __shared__ float sm_rhs[]; - // Save current right-hand-side vector in the shared memory - sm_rhs[threadIdx.x] = sol; - - for (int rec = 0; rec < expansionOrder; rec++) - { - // Making sure that all sm_rhs are saved before they are accessed in a loop below - __syncthreads(); - float mvb = 0.0f; - - for (int n = 0; n < coupledConstraintsCount; n++) - { - int index = n * numConstraintsThreads + threadIndex; - int c1 = gm_coupledConstraintsIdxes[index]; - // Convolute current right-hand-side with A - // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations - mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)]; - } - // 'Switch' rhs vectors, save current result - // These values will be accessed in the loop above during the next iteration. - sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb; - sol = sol + mvb; - } - - // Current mass-scaled Lagrange multipliers - lagrangeScaled = sqrtReducedMass * sol; - - // Save updated coordinates before correction for the rotational lengthening - float3 tmp = rc * lagrangeScaled; - - // Writing for all but dummy constraints - if (!isDummyThread) - { - atomicAdd(&gm_xp[i], -tmp * inverseMassi); - atomicAdd(&gm_xp[j], tmp * inverseMassj); - } - - /* - * Correction for centripetal effects - */ - for (int iter = 0; iter < numIterations; iter++) - { - // Make sure that all xp's are saved: atomic operation calls before are - // communicating current xp[..] values across thread block. 
- __syncthreads(); - - if (!isDummyThread) - { - xi = gm_xp[i]; - xj = gm_xp[j]; - } - - float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); - - float len2 = targetLength * targetLength; - float dlen2 = 2.0f * len2 - norm2(dx); - - // TODO A little bit more effective but slightly less readable version of the below would be: - // float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2)); - float proj; - if (dlen2 > 0.0f) - { - proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2)); - } - else - { - proj = sqrtReducedMass * targetLength; - } - - sm_rhs[threadIdx.x] = proj; - float sol = proj; - - /* - * Same matrix inversion as above is used for updated data - */ - for (int rec = 0; rec < expansionOrder; rec++) - { - // Make sure that all elements of rhs are saved into shared memory - __syncthreads(); - float mvb = 0; - - for (int n = 0; n < coupledConstraintsCount; n++) - { - int index = n * numConstraintsThreads + threadIndex; - int c1 = gm_coupledConstraintsIdxes[index]; - - mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)]; - } - sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb; - sol = sol + mvb; - } - - // Add corrections to Lagrange multipliers - float sqrtmu_sol = sqrtReducedMass * sol; - lagrangeScaled += sqrtmu_sol; - - // Save updated coordinates for the next iteration - // Dummy constraints are skipped - if (!isDummyThread) - { - float3 tmp = rc * sqrtmu_sol; - atomicAdd(&gm_xp[i], -tmp * inverseMassi); - atomicAdd(&gm_xp[j], tmp * inverseMassj); - } - } - - // Updating particle velocities for all but dummy threads - if (updateVelocities && !isDummyThread) - { - float3 tmp = rc * invdt * lagrangeScaled; - atomicAdd(&gm_v[i], -tmp * inverseMassi); - atomicAdd(&gm_v[j], tmp * inverseMassj); - } - - - if (computeVirial) - { - // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length - // (targetLength) and the normalized vector connecting constrained atoms before - // the algorithm was applied (rc). The evaluation of virial in each thread is - // followed by basic reduction for the values inside single thread block. - // Then, the values are reduced across grid by atomicAdd(...). - // - // TODO Shuffle reduction. - // TODO Should be unified and/or done once when virial is actually needed. - // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally, - // one that works for any datatype. - - // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only - // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength, - // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel. - // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter - // two are no longer in use. - extern __shared__ float sm_threadVirial[]; - float mult = targetLength * lagrangeScaled; - sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x; - sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y; - sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z; - sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y; - sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z; - sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z; - - __syncthreads(); - - // Reduce up to one virial per thread block. All blocks are divided by half, the first - // half of threads sums two virials. 
Then the first half is divided by two and the first - // half of it sums two values. This procedure is repeated until only one thread is left. - // Only works if the threads per blocks is a power of two (hence static_assert - // in the beginning of the kernel). - for (int divideBy = 2; divideBy <= static_cast(blockDim.x); divideBy *= 2) - { - int dividedAt = blockDim.x / divideBy; - if (static_cast(threadIdx.x) < dividedAt) - { - for (int d = 0; d < 6; d++) - { - sm_threadVirial[d * blockDim.x + threadIdx.x] += - sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)]; - } - } - // Syncronize if not within one warp - if (dividedAt > warpSize / 2) - { - __syncthreads(); - } - } - // First 6 threads in the block add the results of 6 tensor components to the global memory address. - if (threadIdx.x < 6) - { - atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]); - } - } - - return; -} - -/*! \brief Select templated kernel. - * - * Returns pointer to a CUDA kernel based on provided booleans. - * - * \param[in] updateVelocities If the velocities should be constrained. - * \param[in] computeVirial If virial should be updated. - * - * \return Pointer to CUDA kernel - */ -inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial) -{ - - auto kernelPtr = lincs_kernel; - if (updateVelocities && computeVirial) - { - kernelPtr = lincs_kernel; - } - else if (updateVelocities && !computeVirial) - { - kernelPtr = lincs_kernel; - } - else if (!updateVelocities && computeVirial) - { - kernelPtr = lincs_kernel; - } - else if (!updateVelocities && !computeVirial) - { - kernelPtr = lincs_kernel; - } - return kernelPtr; -} - void LincsGpu::apply(const DeviceBuffer d_x, DeviceBuffer d_xp, const bool updateVelocities, @@ -437,7 +76,7 @@ void LincsGpu::apply(const DeviceBuffer d_x, tensor virialScaled, const PbcAiuc pbcAiuc) { - ensureNoPendingDeviceError("In CUDA version of LINCS"); + GMX_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA."); // Early exit if no constraints if (kernelParams_.numConstraintsThreads == 0) @@ -452,48 +91,10 @@ void LincsGpu::apply(const DeviceBuffer d_x, clearDeviceBufferAsync(&kernelParams_.d_virialScaled, 0, 6, deviceStream_); } - auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial); - - KernelLaunchConfig config; - config.blockSize[0] = c_threadsPerBlock; - config.blockSize[1] = 1; - config.blockSize[2] = 1; - config.gridSize[0] = (kernelParams_.numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock; - config.gridSize[1] = 1; - config.gridSize[2] = 1; - - // Shared memory is used to store: - // -- Current coordinates (3 floats per thread) - // -- Right-hand-sides for matrix inversion (2 floats per thread) - // -- Virial tensor components (6 floats per thread) - // Since none of these three are needed simultaneously, they can be saved at the same shared memory address - // (i.e. correspondent arrays are intentionally overlapped in address space). Consequently, only - // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not. 
- if (computeVirial) - { - config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float); - } - else - { - config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float); - } - kernelParams_.pbcAiuc = pbcAiuc; - const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, - config, - &kernelParams_, - asFloat3Pointer(&d_x), - asFloat3Pointer(&d_xp), - asFloat3Pointer(&d_v), - &invdt); - - launchGpuKernel(kernelPtr, - config, - deviceStream_, - nullptr, - "lincs_kernel", - kernelArgs); + launchLincsGpuKernel( + kernelParams_, d_x, d_xp, updateVelocities, d_v, invdt, computeVirial, deviceStream_); if (computeVirial) { @@ -530,6 +131,7 @@ LincsGpu::LincsGpu(int numIterations, deviceContext_(deviceContext), deviceStream_(deviceStream) { + GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA."); kernelParams_.numIterations = numIterations; kernelParams_.expansionOrder = expansionOrder; @@ -662,7 +264,7 @@ inline int countCoupled(int a, * \param[in] iatoms The list of constraints. * \param[in] stride Number of elements per constraint in \p iatoms. * \param[in] atomsAdjacencyList Information about connections between atoms. - * \param[our] splitMap Map of sequential constraint indexes to indexes to be on the device + * \param[out] splitMap Map of sequential constraint indexes to indexes to be on the device * \param[in] c Sequential index for constraint to consider adding. * \param[in,out] currentMapIndex The rolling index for the constraints mapping. */ @@ -744,8 +346,9 @@ bool LincsGpu::isNumCoupledConstraintsSupported(const gmx_mtop_t& mtop) void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const real* invmass) { + GMX_RELEASE_ASSERT(GMX_GPU_CUDA, "LINCS GPU is only implemented in CUDA."); // List of constrained atoms (CPU memory) - std::vector constraintsHost; + std::vector constraintsHost; // Equilibrium distances for the constraints (CPU) std::vector constraintsTargetLengthsHost; // Number of constraints, coupled with the current one (CPU) @@ -805,9 +408,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const "Number of threads should be a multiple of the block size"); // Initialize constraints and their target indexes taking into account the splits in the data arrays. 
- int2 pair; - pair.x = -1; - pair.y = -1; + AtomPair pair; + pair.i = -1; + pair.j = -1; constraintsHost.resize(kernelParams_.numConstraintsThreads, pair); std::fill(constraintsHost.begin(), constraintsHost.end(), pair); constraintsTargetLengthsHost.resize(kernelParams_.numConstraintsThreads, 0.0); @@ -818,9 +421,9 @@ void LincsGpu::set(const InteractionDefinitions& idef, const int numAtoms, const int a2 = iatoms[stride * c + 2]; int type = iatoms[stride * c]; - int2 pair; - pair.x = a1; - pair.y = a2; + AtomPair pair; + pair.i = a1; + pair.j = a2; constraintsHost.at(splitMap.at(c)) = pair; constraintsTargetLengthsHost.at(splitMap.at(c)) = idef.iparams[type].constr.dA; } diff --git a/src/gromacs/mdlib/lincs_gpu.cuh b/src/gromacs/mdlib/lincs_gpu.h similarity index 93% rename from src/gromacs/mdlib/lincs_gpu.cuh rename to src/gromacs/mdlib/lincs_gpu.h index 6507892649..3ed6cd3e59 100644 --- a/src/gromacs/mdlib/lincs_gpu.cuh +++ b/src/gromacs/mdlib/lincs_gpu.h @@ -49,7 +49,7 @@ #include "gromacs/gpu_utils/devicebuffer_datatype.h" #include "gromacs/gpu_utils/device_context.h" #include "gromacs/gpu_utils/device_stream.h" -#include "gromacs/gpu_utils/gputraits.cuh" +#include "gromacs/gpu_utils/gputraits.h" #include "gromacs/mdlib/constr.h" #include "gromacs/pbcutil/pbc_aiuc.h" @@ -58,6 +58,15 @@ class InteractionDefinitions; namespace gmx { +//! A pair of atoms indexes +struct AtomPair +{ + //! First atom + int i; + //! Second atom + int j; +}; + /* \brief LINCS parameters and GPU pointers * * This is used to accumulate all the parameters and pointers so they can be passed @@ -73,9 +82,9 @@ struct LincsGpuKernelParameters //! Number of iterations used to correct the projection int numIterations; //! 1/mass for all atoms (GPU) - float* d_inverseMasses; + DeviceBuffer d_inverseMasses; //! Scaled virial tensor (6 floats: [XX, XY, XZ, YY, YZ, ZZ], GPU) - float* d_virialScaled; + DeviceBuffer d_virialScaled; /*! \brief Total number of threads. * * This covers all constraints and gaps in the ends of the thread blocks @@ -84,17 +93,17 @@ struct LincsGpuKernelParameters */ int numConstraintsThreads; //! List of constrained atoms (GPU memory) - int2* d_constraints; + DeviceBuffer d_constraints; //! Equilibrium distances for the constraints (GPU) - float* d_constraintsTargetLengths; + DeviceBuffer d_constraintsTargetLengths; //! Number of constraints, coupled with the current one (GPU) - int* d_coupledConstraintsCounts; + DeviceBuffer d_coupledConstraintsCounts; //! List of coupled with the current one (GPU) - int* d_coupledConstraintsIndices; + DeviceBuffer d_coupledConstraintsIndices; //! Elements of the coupling matrix. - float* d_matrixA; + DeviceBuffer d_matrixA; //! Mass factors (GPU) - float* d_massFactors; + DeviceBuffer d_massFactors; }; /*! \internal \brief Class with interfaces and data for GPU version of LINCS. */ diff --git a/src/gromacs/mdlib/lincs_gpu_internal.cu b/src/gromacs/mdlib/lincs_gpu_internal.cu new file mode 100644 index 0000000000..55d4b48bad --- /dev/null +++ b/src/gromacs/mdlib/lincs_gpu_internal.cu @@ -0,0 +1,466 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. 
+ * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Implements LINCS kernels using CUDA + * + * This file contains CUDA kernels of LINCS constraints algorithm. + * + * \author Artem Zhmurov + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#include "gromacs/gpu_utils/cuda_arch_utils.cuh" +#include "gromacs/gpu_utils/cudautils.cuh" +#include "gromacs/gpu_utils/devicebuffer.cuh" +#include "gromacs/gpu_utils/gputraits.h" +#include "gromacs/gpu_utils/typecasts.cuh" +#include "gromacs/gpu_utils/vectype_ops.cuh" +#include "gromacs/mdlib/lincs_gpu.h" +#include "gromacs/pbcutil/pbc_aiuc_cuda.cuh" + +#include "lincs_gpu_internal.h" + +namespace gmx +{ + +//! Maximum number of threads in a block (for __launch_bounds__) +constexpr static int c_maxThreadsPerBlock = c_threadsPerBlock; + +/*! \brief Main kernel for LINCS constraints. + * + * See Hess et al., J. Comput. Chem. 18: 1463-1472 (1997) for the description of the algorithm. + * + * In CUDA version, one thread is responsible for all computations for one constraint. The blocks are + * filled in a way that no constraint is coupled to the constraint from the next block. This is achieved + * by moving active threads to the next block, if the correspondent group of coupled constraints is to big + * to fit the current thread block. This may leave some 'dummy' threads in the end of the thread block, i.e. + * threads that are not required to do actual work. Since constraints from different blocks are not coupled, + * there is no need to synchronize across the device. However, extensive communication in a thread block + * are still needed. + * + * \todo Reduce synchronization overhead. Some ideas are: + * 1. Consider going to warp-level synchronization for the coupled constraints. + * 2. Move more data to local/shared memory and try to get rid of atomic operations (at least on + * the device level). + * 3. Use analytical solution for matrix A inversion. + * 4. Introduce mapping of thread id to both single constraint and single atom, thus designating + * Nth threads to deal with Nat <= Nth coupled atoms and Nc <= Nth coupled constraints. 
+ * See Issue #2885 for details (https://gitlab.com/gromacs/gromacs/-/issues/2885) + * \todo The use of __restrict__ for gm_xp and gm_v causes failure, probably because of the atomic + operations. Investigate this issue further. + * + * \param[in,out] kernelParams All parameters and pointers for the kernel condensed in single struct. + * \param[in] invdt Inverse timestep (needed to update velocities). + */ +template +__launch_bounds__(c_maxThreadsPerBlock) __global__ + void lincs_kernel(LincsGpuKernelParameters kernelParams, + const float3* __restrict__ gm_x, + float3* gm_xp, + float3* gm_v, + const float invdt) +{ + const PbcAiuc pbcAiuc = kernelParams.pbcAiuc; + const int numConstraintsThreads = kernelParams.numConstraintsThreads; + const int numIterations = kernelParams.numIterations; + const int expansionOrder = kernelParams.expansionOrder; + const AtomPair* __restrict__ gm_constraints = kernelParams.d_constraints; + const float* __restrict__ gm_constraintsTargetLengths = kernelParams.d_constraintsTargetLengths; + const int* __restrict__ gm_coupledConstraintsCounts = kernelParams.d_coupledConstraintsCounts; + const int* __restrict__ gm_coupledConstraintsIdxes = kernelParams.d_coupledConstraintsIndices; + const float* __restrict__ gm_massFactors = kernelParams.d_massFactors; + float* __restrict__ gm_matrixA = kernelParams.d_matrixA; + const float* __restrict__ gm_inverseMasses = kernelParams.d_inverseMasses; + float* __restrict__ gm_virialScaled = kernelParams.d_virialScaled; + + int threadIndex = blockIdx.x * blockDim.x + threadIdx.x; + + // numConstraintsThreads should be a integer multiple of blockSize (numConstraintsThreads = numBlocks*blockSize). + // This is to ensure proper synchronizations and reduction. All array are padded to the required size. + assert(threadIndex < numConstraintsThreads); + + // Vectors connecting constrained atoms before algorithm was applied. + // Needed to construct constrain matrix A + extern __shared__ float3 sm_r[]; + + AtomPair pair = gm_constraints[threadIndex]; + int i = pair.i; + int j = pair.j; + + // Mass-scaled Lagrange multiplier + float lagrangeScaled = 0.0f; + + float targetLength; + float inverseMassi; + float inverseMassj; + float sqrtReducedMass; + + float3 xi; + float3 xj; + float3 rc; + + // i == -1 indicates dummy constraint at the end of the thread block. 
+ bool isDummyThread = (i == -1); + + // Everything computed for these dummies will be equal to zero + if (isDummyThread) + { + targetLength = 0.0f; + inverseMassi = 0.0f; + inverseMassj = 0.0f; + sqrtReducedMass = 0.0f; + + xi = make_float3(0.0f, 0.0f, 0.0f); + xj = make_float3(0.0f, 0.0f, 0.0f); + rc = make_float3(0.0f, 0.0f, 0.0f); + } + else + { + // Collecting data + targetLength = gm_constraintsTargetLengths[threadIndex]; + inverseMassi = gm_inverseMasses[i]; + inverseMassj = gm_inverseMasses[j]; + sqrtReducedMass = rsqrt(inverseMassi + inverseMassj); + + xi = gm_x[i]; + xj = gm_x[j]; + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float rlen = rsqrtf(dx.x * dx.x + dx.y * dx.y + dx.z * dx.z); + rc = rlen * dx; + } + + sm_r[threadIdx.x] = rc; + // Make sure that all r's are saved into shared memory + // before they are accessed in the loop below + __syncthreads(); + + /* + * Constructing LINCS matrix (A) + */ + + // Only non-zero values are saved (for coupled constraints) + int coupledConstraintsCount = gm_coupledConstraintsCounts[threadIndex]; + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n * numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + + float3 rc1 = sm_r[c1 - blockIdx.x * blockDim.x]; + gm_matrixA[index] = gm_massFactors[index] * (rc.x * rc1.x + rc.y * rc1.y + rc.z * rc1.z); + } + + // Skipping in dummy threads + if (!isDummyThread) + { + xi = gm_xp[i]; + xj = gm_xp[j]; + } + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float sol = sqrtReducedMass * ((rc.x * dx.x + rc.y * dx.y + rc.z * dx.z) - targetLength); + + /* + * Inverse matrix using a set of expansionOrder matrix multiplications + */ + + // This will use the same memory space as sm_r, which is no longer needed. + extern __shared__ float sm_rhs[]; + // Save current right-hand-side vector in the shared memory + sm_rhs[threadIdx.x] = sol; + + for (int rec = 0; rec < expansionOrder; rec++) + { + // Making sure that all sm_rhs are saved before they are accessed in a loop below + __syncthreads(); + float mvb = 0.0f; + + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n * numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + // Convolute current right-hand-side with A + // Different, non overlapping parts of sm_rhs[..] are read during odd and even iterations + mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)]; + } + // 'Switch' rhs vectors, save current result + // These values will be accessed in the loop above during the next iteration. + sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb; + sol = sol + mvb; + } + + // Current mass-scaled Lagrange multipliers + lagrangeScaled = sqrtReducedMass * sol; + + // Save updated coordinates before correction for the rotational lengthening + float3 tmp = rc * lagrangeScaled; + + // Writing for all but dummy constraints + if (!isDummyThread) + { + atomicAdd(&gm_xp[i], -tmp * inverseMassi); + atomicAdd(&gm_xp[j], tmp * inverseMassj); + } + + /* + * Correction for centripetal effects + */ + for (int iter = 0; iter < numIterations; iter++) + { + // Make sure that all xp's are saved: atomic operation calls before are + // communicating current xp[..] values across thread block. 
+ __syncthreads(); + + if (!isDummyThread) + { + xi = gm_xp[i]; + xj = gm_xp[j]; + } + + float3 dx = pbcDxAiuc(pbcAiuc, xi, xj); + + float len2 = targetLength * targetLength; + float dlen2 = 2.0f * len2 - norm2(dx); + + // TODO A little bit more effective but slightly less readable version of the below would be: + // float proj = sqrtReducedMass*(targetLength - (dlen2 > 0.0f ? 1.0f : 0.0f)*dlen2*rsqrt(dlen2)); + float proj; + if (dlen2 > 0.0f) + { + proj = sqrtReducedMass * (targetLength - dlen2 * rsqrt(dlen2)); + } + else + { + proj = sqrtReducedMass * targetLength; + } + + sm_rhs[threadIdx.x] = proj; + float sol = proj; + + /* + * Same matrix inversion as above is used for updated data + */ + for (int rec = 0; rec < expansionOrder; rec++) + { + // Make sure that all elements of rhs are saved into shared memory + __syncthreads(); + float mvb = 0; + + for (int n = 0; n < coupledConstraintsCount; n++) + { + int index = n * numConstraintsThreads + threadIndex; + int c1 = gm_coupledConstraintsIdxes[index]; + + mvb = mvb + gm_matrixA[index] * sm_rhs[c1 - blockIdx.x * blockDim.x + blockDim.x * (rec % 2)]; + } + sm_rhs[threadIdx.x + blockDim.x * ((rec + 1) % 2)] = mvb; + sol = sol + mvb; + } + + // Add corrections to Lagrange multipliers + float sqrtmu_sol = sqrtReducedMass * sol; + lagrangeScaled += sqrtmu_sol; + + // Save updated coordinates for the next iteration + // Dummy constraints are skipped + if (!isDummyThread) + { + float3 tmp = rc * sqrtmu_sol; + atomicAdd(&gm_xp[i], -tmp * inverseMassi); + atomicAdd(&gm_xp[j], tmp * inverseMassj); + } + } + + // Updating particle velocities for all but dummy threads + if (updateVelocities && !isDummyThread) + { + float3 tmp = rc * invdt * lagrangeScaled; + atomicAdd(&gm_v[i], -tmp * inverseMassi); + atomicAdd(&gm_v[j], tmp * inverseMassj); + } + + + if (computeVirial) + { + // Virial is computed from Lagrange multiplier (lagrangeScaled), target constrain length + // (targetLength) and the normalized vector connecting constrained atoms before + // the algorithm was applied (rc). The evaluation of virial in each thread is + // followed by basic reduction for the values inside single thread block. + // Then, the values are reduced across grid by atomicAdd(...). + // + // TODO Shuffle reduction. + // TODO Should be unified and/or done once when virial is actually needed. + // TODO Recursive version that removes atomicAdd(...)'s entirely is needed. Ideally, + // one that works for any datatype. + + // Save virial for each thread into the shared memory. Tensor is symmetrical, hence only + // 6 values are saved. Dummy threads will have zeroes in their virial: targetLength, + // lagrangeScaled and rc are all set to zero for them in the beginning of the kernel. + // The sm_threadVirial[..] will overlap with the sm_r[..] and sm_rhs[..], but the latter + // two are no longer in use. + extern __shared__ float sm_threadVirial[]; + float mult = targetLength * lagrangeScaled; + sm_threadVirial[0 * blockDim.x + threadIdx.x] = mult * rc.x * rc.x; + sm_threadVirial[1 * blockDim.x + threadIdx.x] = mult * rc.x * rc.y; + sm_threadVirial[2 * blockDim.x + threadIdx.x] = mult * rc.x * rc.z; + sm_threadVirial[3 * blockDim.x + threadIdx.x] = mult * rc.y * rc.y; + sm_threadVirial[4 * blockDim.x + threadIdx.x] = mult * rc.y * rc.z; + sm_threadVirial[5 * blockDim.x + threadIdx.x] = mult * rc.z * rc.z; + + __syncthreads(); + + // Reduce up to one virial per thread block. All blocks are divided by half, the first + // half of threads sums two virials. 
Then the first half is divided by two and the first
+        // half of it sums two values. This procedure is repeated until only one thread is left.
+        // Only works if the number of threads per block is a power of two (hence static_assert
+        // in the beginning of the kernel).
+        for (int divideBy = 2; divideBy <= static_cast<int>(blockDim.x); divideBy *= 2)
+        {
+            int dividedAt = blockDim.x / divideBy;
+            if (static_cast<int>(threadIdx.x) < dividedAt)
+            {
+                for (int d = 0; d < 6; d++)
+                {
+                    sm_threadVirial[d * blockDim.x + threadIdx.x] +=
+                            sm_threadVirial[d * blockDim.x + (threadIdx.x + dividedAt)];
+                }
+            }
+            // Synchronize if not within one warp
+            if (dividedAt > warpSize / 2)
+            {
+                __syncthreads();
+            }
+        }
+        // First 6 threads in the block add the results of 6 tensor components to the global memory address.
+        if (threadIdx.x < 6)
+        {
+            atomicAdd(&(gm_virialScaled[threadIdx.x]), sm_threadVirial[threadIdx.x * blockDim.x]);
+        }
+    }
+
+    return;
+}
+
+/*! \brief Select templated kernel.
+ *
+ * Returns pointer to a CUDA kernel based on provided booleans.
+ *
+ * \param[in] updateVelocities If the velocities should be constrained.
+ * \param[in] computeVirial    If virial should be updated.
+ *
+ * \return Pointer to CUDA kernel
+ */
+inline auto getLincsKernelPtr(const bool updateVelocities, const bool computeVirial)
+{
+
+    auto kernelPtr = lincs_kernel<true, true>;
+    if (updateVelocities && computeVirial)
+    {
+        kernelPtr = lincs_kernel<true, true>;
+    }
+    else if (updateVelocities && !computeVirial)
+    {
+        kernelPtr = lincs_kernel<true, false>;
+    }
+    else if (!updateVelocities && computeVirial)
+    {
+        kernelPtr = lincs_kernel<false, true>;
+    }
+    else if (!updateVelocities && !computeVirial)
+    {
+        kernelPtr = lincs_kernel<false, false>;
+    }
+    return kernelPtr;
+}
+
+void launchLincsGpuKernel(LincsGpuKernelParameters&  kernelParams,
+                          const DeviceBuffer<Float3> d_x,
+                          DeviceBuffer<Float3>       d_xp,
+                          const bool                 updateVelocities,
+                          DeviceBuffer<Float3>       d_v,
+                          const real                 invdt,
+                          const bool                 computeVirial,
+                          const DeviceStream&        deviceStream)
+{
+
+    auto kernelPtr = getLincsKernelPtr(updateVelocities, computeVirial);
+
+    KernelLaunchConfig config;
+    config.blockSize[0] = c_threadsPerBlock;
+    config.blockSize[1] = 1;
+    config.blockSize[2] = 1;
+    config.gridSize[0]  = (kernelParams.numConstraintsThreads + c_threadsPerBlock - 1) / c_threadsPerBlock;
+    config.gridSize[1]  = 1;
+    config.gridSize[2]  = 1;
+
+    // Shared memory is used to store:
+    // -- Current coordinates (3 floats per thread)
+    // -- Right-hand-sides for matrix inversion (2 floats per thread)
+    // -- Virial tensor components (6 floats per thread)
+    // Since none of these three are needed simultaneously, they can be saved at the same shared memory address
+    // (i.e. the corresponding arrays are intentionally overlapped in address space). Consequently, only
+    // max{3, 2, 6} = 6 floats per thread are needed in case virial is computed, or max{3, 2} = 3 if not.
+ if (computeVirial) + { + config.sharedMemorySize = c_threadsPerBlock * 6 * sizeof(float); + } + else + { + config.sharedMemorySize = c_threadsPerBlock * 3 * sizeof(float); + } + + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, + config, + &kernelParams, + asFloat3Pointer(&d_x), + asFloat3Pointer(&d_xp), + asFloat3Pointer(&d_v), + &invdt); + + launchGpuKernel(kernelPtr, + config, + deviceStream, + nullptr, + "lincs_kernel", + kernelArgs); + + return; +} + +} // namespace gmx diff --git a/src/gromacs/mdlib/lincs_gpu_internal.h b/src/gromacs/mdlib/lincs_gpu_internal.h new file mode 100644 index 0000000000..5c42a784c6 --- /dev/null +++ b/src/gromacs/mdlib/lincs_gpu_internal.h @@ -0,0 +1,71 @@ +/* + * This file is part of the GROMACS molecular simulation package. + * + * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by + * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, + * and including many others, as listed in the AUTHORS file in the + * top-level source directory and at http://www.gromacs.org. + * + * GROMACS is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 + * of the License, or (at your option) any later version. + * + * GROMACS is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with GROMACS; if not, see + * http://www.gnu.org/licenses, or write to the Free Software Foundation, + * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * If you want to redistribute modifications to GROMACS, please + * consider that scientific software is very special. Version + * control is crucial - bugs must be traceable. We will be happy to + * consider code for inclusion in the official distribution, but + * derived work must not be called official GROMACS. Details are found + * in the README & COPYING files - if they are missing, get the + * official version at http://www.gromacs.org. + * + * To help us fund GROMACS development, we humbly ask that you cite + * the research papers on the package. Check out http://www.gromacs.org. + */ +/*! \internal \file + * + * \brief Declare backend-specific LINCS GPU functions + * + * \author Artem Zhmurov + * \author Alan Gray + * + * \ingroup module_mdlib + */ +#ifndef GMX_MDLIB_LINCS_GPU_INTERNAL_H +#define GMX_MDLIB_LINCS_GPU_INTERNAL_H + +#include "gromacs/gpu_utils/devicebuffer_datatype.h" +#include "gromacs/gpu_utils/gputraits.h" + +class DeviceStream; + +namespace gmx +{ + +struct LincsGpuKernelParameters; + +//! 
Number of threads in a GPU block
+constexpr static int c_threadsPerBlock = 256;
+
+void launchLincsGpuKernel(LincsGpuKernelParameters&  kernelParams,
+                          const DeviceBuffer<Float3> d_x,
+                          DeviceBuffer<Float3>       d_xp,
+                          const bool                 updateVelocities,
+                          DeviceBuffer<Float3>       d_v,
+                          const real                 invdt,
+                          const bool                 computeVirial,
+                          const DeviceStream&        deviceStream);
+
+} // namespace gmx
+
+#endif // GMX_MDLIB_LINCS_GPU_INTERNAL_H
diff --git a/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
new file mode 100644
index 0000000000..b5594c1bc0
--- /dev/null
+++ b/src/gromacs/mdlib/lincs_gpu_internal_sycl.cpp
@@ -0,0 +1,68 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *
+ * \brief Implements LINCS kernels using SYCL
+ *
+ * This file contains SYCL kernels of the LINCS constraints algorithm.
+ * + * \author Artem Zhmurov + * + * \ingroup module_mdlib + */ +#include "lincs_gpu_internal.h" + +#include "gromacs/utility/gmxassert.h" + +namespace gmx +{ + +void launchLincsGpuKernel(LincsGpuKernelParameters& /* kernelParams */, + const DeviceBuffer /* d_x */, + DeviceBuffer /* d_xp */, + const bool /* updateVelocities */, + DeviceBuffer /* d_v */, + const real /* invdt */, + const bool /* computeVirial */, + const DeviceStream& /* deviceStream */) +{ + + // SYCL_TODO + GMX_RELEASE_ASSERT(false, "LINCS is not yet implemented in SYCL."); + + return; +} + +} // namespace gmx diff --git a/src/gromacs/mdlib/tests/constrtestrunners.cu b/src/gromacs/mdlib/tests/constrtestrunners.cu index c22b91f351..1f45f0ccab 100644 --- a/src/gromacs/mdlib/tests/constrtestrunners.cu +++ b/src/gromacs/mdlib/tests/constrtestrunners.cu @@ -54,7 +54,7 @@ #include "gromacs/gpu_utils/devicebuffer.cuh" #include "gromacs/gpu_utils/gputraits.h" #include "gromacs/hardware/device_information.h" -#include "gromacs/mdlib/lincs_gpu.cuh" +#include "gromacs/mdlib/lincs_gpu.h" #include "gromacs/pbcutil/pbc.h" #include "gromacs/utility/unique_cptr.h" diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.cu b/src/gromacs/mdlib/update_constrain_gpu_impl.cu index 5a950d34e5..ba3a1d12af 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.cu +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.cu @@ -64,7 +64,7 @@ #include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/gpu_utils/vectype_ops.cuh" #include "gromacs/mdlib/leapfrog_gpu.h" -#include "gromacs/mdlib/lincs_gpu.cuh" +#include "gromacs/mdlib/lincs_gpu.h" #include "gromacs/mdlib/settle_gpu.cuh" #include "gromacs/mdlib/update_constrain_gpu.h" #include "gromacs/mdtypes/mdatom.h" diff --git a/src/gromacs/mdlib/update_constrain_gpu_impl.h b/src/gromacs/mdlib/update_constrain_gpu_impl.h index ece9859374..dcca5dc2a9 100644 --- a/src/gromacs/mdlib/update_constrain_gpu_impl.h +++ b/src/gromacs/mdlib/update_constrain_gpu_impl.h @@ -49,7 +49,7 @@ #include "gmxpre.h" #include "gromacs/mdlib/leapfrog_gpu.h" -#include "gromacs/mdlib/lincs_gpu.cuh" +#include "gromacs/mdlib/lincs_gpu.h" #include "gromacs/mdlib/settle_gpu.cuh" #include "gromacs/mdlib/update_constrain_gpu.h" #include "gromacs/mdtypes/inputrec.h" -- 2.22.0