From: Andrey Alekseenko
Date: Tue, 21 Sep 2021 18:54:59 +0000 (+0000)
Subject: SYCL PME Spread kernel
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=f190fa59b4a72390b2aea2e74b4960941098ce4d;p=alexxy%2Fgromacs.git

SYCL PME Spread kernel
---

diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt
index ce81582dcb..27dc116a28 100644
--- a/src/gromacs/ewald/CMakeLists.txt
+++ b/src/gromacs/ewald/CMakeLists.txt
@@ -88,20 +88,21 @@ elseif (GMX_GPU_OPENCL)
 elseif (GMX_GPU_SYCL)
     # SYCL-TODO: proper implementation
     gmx_add_libgromacs_sources(
-        # Files that implement stubs
-        pme_gpu_sycl_stubs.cpp
         # GPU-specific sources
         pme_gpu.cpp
         pme_gpu_internal.cpp
+        pme_gpu_program_impl_sycl.cpp
+        pme_spread_sycl.cpp
         pme_gpu_timings.cpp
         )
     _gmx_add_files_to_property(SYCL_SOURCES
         pme_gpu_internal.cpp
         pme_gpu_program.cpp
-        pme_gpu_sycl_stubs.cpp
+        pme_gpu_program_impl_sycl.cpp
         pme_gpu_3dfft_sycl.cpp
         pme_gpu_timings.cpp
-        )
+        pme_spread_sycl.cpp
+        )
 else()
     gmx_add_libgromacs_sources(
         # Files that implement stubs
diff --git a/src/gromacs/ewald/pme.cpp b/src/gromacs/ewald/pme.cpp
index 5eba296172..fe65e9d6e3 100644
--- a/src/gromacs/ewald/pme.cpp
+++ b/src/gromacs/ewald/pme.cpp
@@ -127,6 +127,8 @@
 #include "pme_spline_work.h"
 #include "pme_spread.h"
 
+bool g_allowPmeWithSyclForTesting = false;
+
 bool pme_gpu_supports_build(std::string* error)
 {
     gmx::MessageStringCollector errorReasons;
@@ -134,7 +136,7 @@ bool pme_gpu_supports_build(std::string* error)
     errorReasons.startContext("PME GPU does not support:");
     errorReasons.appendIf(GMX_DOUBLE, "Double-precision build of GROMACS.");
     errorReasons.appendIf(!GMX_GPU, "Non-GPU build of GROMACS.");
-    errorReasons.appendIf(GMX_GPU_SYCL, "SYCL build."); // SYCL-TODO
+    errorReasons.appendIf(GMX_GPU_SYCL && !g_allowPmeWithSyclForTesting, "SYCL build."); // SYCL-TODO
     errorReasons.finishContext();
     if (error != nullptr)
     {
@@ -196,7 +198,7 @@ static bool pme_gpu_check_restrictions(const gmx_pme_t* pme, std::string* error)
     errorReasons.appendIf(pme->doLJ, "Lennard-Jones PME.");
     errorReasons.appendIf(GMX_DOUBLE, "Double precision build of GROMACS.");
     errorReasons.appendIf(!GMX_GPU, "Non-GPU build of GROMACS.");
-    errorReasons.appendIf(GMX_GPU_SYCL, "SYCL build of GROMACS."); // SYCL-TODO
+    errorReasons.appendIf(GMX_GPU_SYCL && !g_allowPmeWithSyclForTesting, "SYCL build of GROMACS."); // SYCL-TODO
     errorReasons.finishContext();
     if (error != nullptr)
     {
diff --git a/src/gromacs/ewald/pme.h b/src/gromacs/ewald/pme.h
index 1e32caf265..0576e51b19 100644
--- a/src/gromacs/ewald/pme.h
+++ b/src/gromacs/ewald/pme.h
@@ -75,6 +75,18 @@ enum class GpuTaskCompletion;
 class PmeGpuProgram;
 class GpuEventSynchronizer;
 
+/*! \brief Hack to selectively enable some parts of PME during unit testing.
+ *
+ * Set to \c false by default. If any of the tests sets it to \c true, it will
+ * make the compatibility check consider PME to be supported in SYCL builds.
+ *
+ * Currently we don't have a proper PME implementation with SYCL, but we still want
+ * to run tests for some of the kernels.
+ *
+ * \todo Remove after #3927 is done and PME is fully enabled in SYCL builds.
+ */
+extern bool g_allowPmeWithSyclForTesting;
+
 namespace gmx
 {
 template<typename>
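
The testing-only flag introduced above is meant to be flipped around the compatibility checks and restored afterwards. A minimal sketch of the intended usage, assuming only the declarations this patch adds to pme.h (the helper function itself is hypothetical):

    #include <string>

    extern bool g_allowPmeWithSyclForTesting;        // added to pme.h by this patch
    bool pme_gpu_supports_build(std::string* error); // declared in pme.h

    // Hypothetical helper: opt into the SYCL PME paths, check support, restore the default.
    void runWithSyclPmeEnabled()
    {
        g_allowPmeWithSyclForTesting = true;
        std::string error;
        const bool  supported = pme_gpu_supports_build(&error);
        (void)supported; // ... run the spline/spread kernel tests only if supported ...
        g_allowPmeWithSyclForTesting = false; // revert, as TearDownTestSuite() does below
    }
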
diff --git a/src/gromacs/ewald/pme_gpu_calculate_splines_sycl.h b/src/gromacs/ewald/pme_gpu_calculate_splines_sycl.h
new file mode 100644
index 0000000000..998ce6b2e4
--- /dev/null
+++ b/src/gromacs/ewald/pme_gpu_calculate_splines_sycl.h
@@ -0,0 +1,403 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements helper routines for PME gather and spline routines.
+ *
+ * \author Andrey Alekseenko
+ */
+
+#include "gmxpre.h"
+
+#include <cassert>
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits_sycl.h"
+#include "gromacs/gpu_utils/sycl_kernel_utils.h"
+
+#include "pme_grid.h"
+#include "pme_gpu_constants.h"
+#include "pme_gpu_types.h"
+
+namespace
+{
+
+/*! \brief Asserts that the argument is finite.
+ *
+ * The function works for any data type that can be cast to float. Note that there is also
+ * a specialized implementation for the Float3 data type.
+ *
+ * \param[in] arg Argument to check.
+ */
+template<typename T>
+inline void assertIsFinite(T arg);
+
+#if defined(NDEBUG) || GMX_SYCL_HIPSYCL
+// We have no cl::sycl::isfinite in hipSYCL yet
+template<typename T>
+inline void assertIsFinite(T /* arg */)
+{
+}
+#else
+template<>
+inline void assertIsFinite(Float3 gmx_used_in_debug arg)
+{
+    assert(cl::sycl::isfinite(arg[0]));
+    assert(cl::sycl::isfinite(arg[1]));
+    assert(cl::sycl::isfinite(arg[2]));
+}
+
+template<typename T>
+inline void assertIsFinite(T gmx_used_in_debug arg)
+{
+    assert(cl::sycl::isfinite(static_cast<float>(arg)));
+}
+#endif
+
+} // namespace
+
+using cl::sycl::access::fence_space;
+using cl::sycl::access::mode;
+using cl::sycl::access::target;
+
+/*! \internal \brief
+ * Gets a base of the unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The base only corresponds to the atom index
+ * within the execution block.
+ * Feed the result into getSplineParamIndex() to get a full index.
+ * TODO: it's likely that both parameters can be just replaced with a single atom index, as they
+ * are derived from it. Do that, verifying that the generated code is not bloated, and/or revise
+ * the spline indexing scheme.
+ * Removing the warp dependency would also be nice (and would probably coincide with removing
+ * c_pmeSpreadGatherAtomsPerWarp).
+ *
+ * \tparam order            PME order
+ * \tparam atomsPerSubGroup Number of atoms processed by a sub group
+ * \param[in] subGroupIndex     Sub group index in the work group.
+ * \param[in] atomSubGroupIndex Atom index in the sub group (from 0 to atomsPerSubGroup - 1).
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template<int order, int atomsPerSubGroup>
+static inline int getSplineParamIndexBase(int subGroupIndex, int atomSubGroupIndex)
+{
+    assert((atomSubGroupIndex >= 0) && (atomSubGroupIndex < atomsPerSubGroup));
+    constexpr int dimIndex    = 0;
+    constexpr int splineIndex = 0;
+    // The zeroes are here to preserve the full index formula for reference
+    return (((splineIndex + order * subGroupIndex) * DIM + dimIndex) * atomsPerSubGroup + atomSubGroupIndex);
+}
+
+/*! \internal \brief
+ * Gets a unique index to an element in a spline parameter buffer (theta/dtheta),
+ * which is laid out for GPU spread/gather kernels. The index is w.r.t. the execution block,
+ * in the range [0, atomsPerBlock * order * DIM).
+ * This function consumes the result of getSplineParamIndexBase() and adjusts it for \p dimIndex and \p splineIndex.
+ *
+ * \tparam order            PME order
+ * \tparam atomsPerSubGroup Number of atoms processed by a sub group
+ * \param[in] paramIndexBase Must be the result of getSplineParamIndexBase().
+ * \param[in] dimIndex       Dimension index (from 0 to 2)
+ * \param[in] splineIndex    Spline contribution index (from 0 to \p order - 1)
+ *
+ * \returns Index into theta or dtheta array using GPU layout.
+ */
+template<int order, int atomsPerSubGroup>
+static inline int getSplineParamIndex(int paramIndexBase, int dimIndex, int splineIndex)
+{
+    assert((dimIndex >= XX) && (dimIndex < DIM));
+    assert((splineIndex >= 0) && (splineIndex < order));
+    return (paramIndexBase + (splineIndex * DIM + dimIndex) * atomsPerSubGroup);
+}
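
To make the layout concrete: the two helpers above interleave atoms within a sub-group, so consecutive buffer elements belong to different atoms. A standalone host-side sketch (constants inlined; in GROMACS, DIM comes from the vec-type headers) that prints the indices for order = 4 and two atoms per sub-group:

    #include <cstdio>

    constexpr int c_dim = 3; // stand-in for DIM

    // Same formulas as getSplineParamIndexBase()/getSplineParamIndex() above.
    template<int order, int atomsPerSubGroup>
    constexpr int paramIndexBase(int subGroupIndex, int atomSubGroupIndex)
    {
        return ((order * subGroupIndex) * c_dim) * atomsPerSubGroup + atomSubGroupIndex;
    }

    template<int order, int atomsPerSubGroup>
    constexpr int paramIndex(int base, int dimIndex, int splineIndex)
    {
        return base + (splineIndex * c_dim + dimIndex) * atomsPerSubGroup;
    }

    int main()
    {
        // Atom 0 of sub-group 0 gets the even indices, atom 1 the odd ones.
        for (int splineIndex = 0; splineIndex < 4; splineIndex++)
        {
            for (int dimIndex = 0; dimIndex < c_dim; dimIndex++)
            {
                std::printf("spline %d dim %d -> atom0 %2d, atom1 %2d\n",
                            splineIndex,
                            dimIndex,
                            paramIndex<4, 2>(paramIndexBase<4, 2>(0, 0), dimIndex, splineIndex),
                            paramIndex<4, 2>(paramIndexBase<4, 2>(0, 1), dimIndex, splineIndex));
            }
        }
        return 0;
    }
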
+
+/*! \internal \brief
+ * An inline function for skipping the zero-charge atoms when \c c_skipNeutralAtoms is \c true.
+ *
+ * \returns \c true if atom should be processed, \c false otherwise.
+ * \param[in] charge The atom charge.
+ */
+static inline bool pmeGpuCheckAtomCharge(const float charge)
+{
+    assertIsFinite(charge);
+    return c_skipNeutralAtoms ? (charge != 0.0F) : true;
+}
+
+/*! \brief
+ * General purpose function for loading atom-related data from global to shared memory.
+ *
+ * \tparam T                 Data type (float/int/...).
+ * \tparam atomsPerWorkGroup Number of atoms processed by a block - should be accounted for
+ *                           in the size of the shared memory array.
+ * \tparam dataCountPerAtom  Number of data elements per single atom (e.g. \c DIM for an rvec
+ *                           coordinates array).
+ * \param[out] sm_destination Shared memory array for output.
+ * \param[in]  gm_source      Global memory array for input.
+ * \param[in]  itemIdx        SYCL thread ID.
+ */
+template<typename T, int atomsPerWorkGroup, int dataCountPerAtom>
+static inline void pmeGpuStageAtomData(cl::sycl::local_ptr<T>              sm_destination,
+                                       const cl::sycl::global_ptr<const T> gm_source,
+                                       cl::sycl::nd_item<3>                itemIdx)
+{
+    const int blockIndex      = itemIdx.get_group_linear_id();
+    const int localIndex      = itemIdx.get_local_linear_id();
+    const int globalIndexBase = blockIndex * atomsPerWorkGroup * dataCountPerAtom;
+    const int globalIndex     = globalIndexBase + localIndex;
+    if (localIndex < atomsPerWorkGroup * dataCountPerAtom)
+    {
+        assertIsFinite(gm_source[globalIndex]);
+        sm_destination[localIndex] = gm_source[globalIndex];
+    }
+}
+
+/*! \brief
+ * PME GPU spline parameter and gridline indices calculation.
+ * This corresponds to the CPU functions calc_interpolation_idx() and make_bsplines().
+ * First stage of the whole kernel.
+ *
+ * \tparam order         PME interpolation order.
+ * \tparam atomsPerBlock Number of atoms processed by a block - should be accounted for
+ *                       in the sizes of the shared memory arrays.
+ * \tparam atomsPerWarp  Number of atoms processed by a warp.
+ * \tparam writeSmDtheta Bool controlling if the theta derivative should be written to
+ *                       shared memory. Enables calculation of dtheta if set.
+ * \tparam writeGlobal   A boolean which tells if the theta values and gridlines should
+ *                       be written to global memory. Enables calculation of dtheta if set.
+ * \tparam numGrids      The number of grids using the splines.
+ * \tparam subGroupSize  The size of a sub-group (warp).
+ * \param[in]  atomIndexOffset         Starting atom index for the execution block in the global memory.
+ * \param[in]  atomX                   Coordinates of atom processed by thread.
+ * \param[in]  atomCharge              Charge/coefficient of atom processed by thread.
+ * \param[in]  tablesOffsets           Offsets for X/Y/Z components of \p gm_fractShiftsTable and
+ *                                     \p gm_gridlineIndicesTable.
+ * \param[in]  realGridSizeFP          Real-space grid dimensions, converted to floating point.
+ * \param[in]  currentRecipBox0        Current reciprocal (inverted unit cell) box, vector 1.
+ * \param[in]  currentRecipBox1        Current reciprocal (inverted unit cell) box, vector 2.
+ * \param[in]  currentRecipBox2        Current reciprocal (inverted unit cell) box, vector 3.
+ * \param[out] gm_theta                Atom spline values in the global memory.
+ *                                     Used only if \p writeGlobal is \c true.
+ * \param[out] gm_dtheta               Derivatives of atom spline values in the global memory.
+ *                                     Used only if \p writeGlobal is \c true.
+ * \param[out] gm_gridlineIndices      Atom gridline indices in the global memory.
+ *                                     Used only if \p writeGlobal is \c true.
+ * \param[in]  gm_fractShiftsTable     Fractional shifts lookup table in the global memory.
+ * \param[in]  gm_gridlineIndicesTable Gridline indices lookup table in the global memory.
+ * \param[out] sm_theta                Atom spline values in the local memory.
+ * \param[out] sm_dtheta               Derivatives of atom spline values in the local memory.
+ * \param[out] sm_gridlineIndices      Atom gridline indices in the local memory.
+ * \param[out] sm_fractCoords          Fractional coordinates in the local memory.
+ * \param[in]  itemIdx                 SYCL thread ID.
+ */
+template<int order, int atomsPerBlock, int atomsPerWarp, bool writeSmDtheta, bool writeGlobal, int numGrids, int subGroupSize>
+static inline void calculateSplines(const int                               atomIndexOffset,
+                                    const Float3                            atomX,
+                                    const float                             atomCharge,
+                                    const gmx::IVec                         tablesOffsets,
+                                    const gmx::RVec                         realGridSizeFP,
+                                    const gmx::RVec                         currentRecipBox0,
+                                    const gmx::RVec                         currentRecipBox1,
+                                    const gmx::RVec                         currentRecipBox2,
+                                    cl::sycl::global_ptr<float>             gm_theta,
+                                    cl::sycl::global_ptr<float>             gm_dtheta,
+                                    cl::sycl::global_ptr<int>               gm_gridlineIndices,
+                                    const cl::sycl::global_ptr<const float> gm_fractShiftsTable,
+                                    const cl::sycl::global_ptr<const int>   gm_gridlineIndicesTable,
+                                    cl::sycl::local_ptr<float>              sm_theta,
+                                    cl::sycl::local_ptr<float>              sm_dtheta,
+                                    cl::sycl::local_ptr<int>                sm_gridlineIndices,
+                                    cl::sycl::local_ptr<float>              sm_fractCoords,
+                                    cl::sycl::nd_item<3>                    itemIdx)
+{
+    static_assert(numGrids == 1 || numGrids == 2);
+    static_assert(numGrids == 1 || c_skipNeutralAtoms == false);
+
+    /* Thread index w.r.t. block */
+    const int threadLocalId = itemIdx.get_local_linear_id();
+    /* Warp index w.r.t. block - could probably be obtained easier? */
+    const int warpIndex = threadLocalId / subGroupSize;
+    /* Atom index w.r.t. warp - alternating 0 1 0 1 ... */
+    const int atomWarpIndex = itemIdx.get_local_id(0) % atomsPerWarp;
+    /* Atom index w.r.t. block/shared memory */
+    const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+
+    /* Spline contribution index in one dimension */
+    const int threadLocalIdXY =
+            (itemIdx.get_local_id(1) * itemIdx.get_group_range(2)) + itemIdx.get_local_id(2);
+    const int orderIndex = threadLocalIdXY / DIM;
+    /* Dimension index */
+    const int dimIndex = threadLocalIdXY % DIM;
+
+    /* Multi-purpose index of rvec/ivec atom data */
+    const int sharedMemoryIndex = atomIndexLocal * DIM + dimIndex;
+
+    float splineData[order];
+
+    const int localCheck = (dimIndex < DIM) && (orderIndex < 1);
+
+    /* we have 4 threads per atom, but can only use 3 here for the dimensions */
+    if (localCheck)
+    {
+        /* Indices interpolation */
+        if (orderIndex == 0)
+        {
+            int   tableIndex, tInt;
+            float n, t;
+            assert(atomIndexLocal < DIM * atomsPerBlock);
+            // Switch structure inherited from CUDA.
+            // TODO: Issue #4153: Direct indexing with dimIndex can be better with SYCL
+            switch (dimIndex)
+            {
+                case XX:
+                    tableIndex = tablesOffsets[XX];
+                    n          = realGridSizeFP[XX];
+                    t          = atomX[XX] * currentRecipBox0[XX] + atomX[YY] * currentRecipBox0[YY]
+                        + atomX[ZZ] * currentRecipBox0[ZZ];
+                    break;
+
+                case YY:
+                    tableIndex = tablesOffsets[YY];
+                    n          = realGridSizeFP[YY];
+                    t          = atomX[YY] * currentRecipBox1[YY] + atomX[ZZ] * currentRecipBox1[ZZ];
+                    break;
+
+                case ZZ:
+                    tableIndex = tablesOffsets[ZZ];
+                    n          = realGridSizeFP[ZZ];
+                    t          = atomX[ZZ] * currentRecipBox2[ZZ];
+                    break;
+            }
+            const float shift = c_pmeMaxUnitcellShift;
+            /* Fractional coordinates along box vectors, adding a positive shift to ensure t is positive for triclinic boxes */
+            t    = (t + shift) * n;
+            tInt = static_cast<int>(t);
+            assert(sharedMemoryIndex < atomsPerBlock * DIM);
+            sm_fractCoords[sharedMemoryIndex] = t - tInt;
+            tableIndex += tInt;
+            assert(tInt >= 0);
+            assert(tInt < c_pmeNeighborUnitcellCount * n);
+
+            // TODO: Issue #4153: use shared table for both parameters to share the fetch, as index is always same.
+            sm_fractCoords[sharedMemoryIndex] += gm_fractShiftsTable[tableIndex];
+            sm_gridlineIndices[sharedMemoryIndex] = gm_gridlineIndicesTable[tableIndex];
+            if constexpr (writeGlobal)
+            {
+                gm_gridlineIndices[atomIndexOffset * DIM + sharedMemoryIndex] =
+                        sm_gridlineIndices[sharedMemoryIndex];
+            }
+        }
+
+        /* B-spline calculation */
+        const int chargeCheck = pmeGpuCheckAtomCharge(atomCharge);
+        /* With FEP (numGrids == 2), we might have 0 charge in state A, but != 0 in state B, so we always calculate splines */
+        if (numGrids == 2 || chargeCheck)
+        {
+            const float dr = sm_fractCoords[sharedMemoryIndex];
+            assertIsFinite(dr);
+
+            /* dr is relative offset from lower cell limit */
+            splineData[order - 1] = 0.0F;
+            splineData[1]         = dr;
+            splineData[0]         = 1.0F - dr;
+
+#pragma unroll
+            for (int k = 3; k < order; k++)
+            {
+                const float div   = 1.0F / (k - 1.0F);
+                splineData[k - 1] = div * dr * splineData[k - 2];
+#pragma unroll
+                for (int l = 1; l < (k - 1); l++)
+                {
+                    splineData[k - l - 1] =
+                            div * ((dr + l) * splineData[k - l - 2] + (k - l - dr) * splineData[k - l - 1]);
+                }
+                splineData[0] = div * (1.0F - dr) * splineData[0];
+            }
+
+            const int thetaIndexBase =
+                    getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+            const int thetaGlobalOffsetBase = atomIndexOffset * DIM * order;
+            /* only calculate dtheta if we are saving it to shared or global memory */
+            if constexpr (writeSmDtheta || writeGlobal)
+            {
+                /* Differentiation and storing the spline derivatives (dtheta) */
+#pragma unroll
+                for (int o = 0; o < order; o++)
+                {
+                    const int thetaIndex =
+                            getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+
+                    const float dtheta = ((o > 0) ? splineData[o - 1] : 0.0F) - splineData[o];
+                    assertIsFinite(dtheta);
+                    assert(thetaIndex < order * DIM * atomsPerBlock);
+                    if constexpr (writeSmDtheta)
+                    {
+                        sm_dtheta[thetaIndex] = dtheta;
+                    }
+                    if constexpr (writeGlobal)
+                    {
+                        const int thetaGlobalIndex  = thetaGlobalOffsetBase + thetaIndex;
+                        gm_dtheta[thetaGlobalIndex] = dtheta;
+                    }
+                }
+            }
+
+            const float div       = 1.0F / (order - 1.0F);
+            splineData[order - 1] = div * dr * splineData[order - 2];
+#pragma unroll
+            for (int k = 1; k < (order - 1); k++)
+            {
+                splineData[order - k - 1] = div
+                                            * ((dr + k) * splineData[order - k - 2]
+                                               + (order - k - dr) * splineData[order - k - 1]);
+            }
+            splineData[0] = div * (1.0F - dr) * splineData[0];
+
+            /* Storing the spline values (theta) */
+#pragma unroll
+            for (int o = 0; o < order; o++)
+            {
+                const int thetaIndex =
+                        getSplineParamIndex<order, atomsPerWarp>(thetaIndexBase, dimIndex, o);
+                assert(thetaIndex < order * DIM * atomsPerBlock);
+                sm_theta[thetaIndex] = splineData[o];
+                assertIsFinite(sm_theta[thetaIndex]);
+                if constexpr (writeGlobal)
+                {
+                    const int thetaGlobalIndex = thetaGlobalOffsetBase + thetaIndex;
+                    gm_theta[thetaGlobalIndex] = splineData[o];
+                }
+            }
+        }
+    }
+}
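
For reference, the two-stage recursion used in calculateSplines() can be checked on the host in isolation. A standalone sketch of the same order-4 weight construction (same arithmetic, no GPU types); for dr = 0.5 it yields the cubic B-spline weights 1/48, 23/48, 23/48, 1/48, which sum to 1:

    #include <cstdio>

    // Host-side copy of the order-4 B-spline weight construction used above.
    void makeBsplineWeights(const float dr, float splineData[4])
    {
        constexpr int order   = 4;
        splineData[order - 1] = 0.0F;
        splineData[1]         = dr;
        splineData[0]         = 1.0F - dr;

        for (int k = 3; k < order; k++)
        {
            const float div   = 1.0F / (k - 1.0F);
            splineData[k - 1] = div * dr * splineData[k - 2];
            for (int l = 1; l < (k - 1); l++)
            {
                splineData[k - l - 1] = div
                                        * ((dr + l) * splineData[k - l - 2]
                                           + (k - l - dr) * splineData[k - l - 1]);
            }
            splineData[0] = div * (1.0F - dr) * splineData[0];
        }
        // Final-order pass, the last stage of the recursion in calculateSplines()
        const float div       = 1.0F / (order - 1.0F);
        splineData[order - 1] = div * dr * splineData[order - 2];
        for (int k = 1; k < (order - 1); k++)
        {
            splineData[order - k - 1] = div
                                        * ((dr + k) * splineData[order - k - 2]
                                           + (order - k - dr) * splineData[order - k - 1]);
        }
        splineData[0] = div * (1.0F - dr) * splineData[0];
    }

    int main()
    {
        float w[4];
        makeBsplineWeights(0.5F, w);
        std::printf("%g %g %g %g (sum %g)\n", w[0], w[1], w[2], w[3], w[0] + w[1] + w[2] + w[3]);
        return 0;
    }
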
diff --git a/src/gromacs/ewald/pme_gpu_program_impl.h b/src/gromacs/ewald/pme_gpu_program_impl.h
index 8a1e8afaf1..621a7b9631 100644
--- a/src/gromacs/ewald/pme_gpu_program_impl.h
+++ b/src/gromacs/ewald/pme_gpu_program_impl.h
@@ -49,6 +49,7 @@
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/utility/classhelpers.h"
 
+class ISyclKernelFunctor;
 class DeviceContext;
 struct DeviceInformation;
 
@@ -86,7 +87,7 @@ struct PmeGpuProgramImpl
 #elif GMX_GPU_OPENCL
     using PmeKernelHandle = cl_kernel;
 #else
-    using PmeKernelHandle = void*;
+    using PmeKernelHandle = ISyclKernelFunctor*;
 #endif
 
     /*! \brief
diff --git a/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
new file mode 100644
index 0000000000..07da0a848c
--- /dev/null
+++ b/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
@@ -0,0 +1,179 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief
+ * Implements PmeGpuProgramImpl, which stores permanent PME GPU context-derived data,
+ * such as (compiled) kernel handles.
+ *
+ * \author Aleksei Iupinov
+ * \author Andrey Alekseenko
+ * \ingroup module_ewald
+ */
+#include "gmxpre.h"
+
+#include "gromacs/hardware/device_information.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/syclutils.h"
+
+#include "pme_gpu_program_impl.h"
+#include "pme_spread_sycl.h"
+
+#include "pme_gpu_constants.h"
+#include "pme_gpu_internal.h" // for GridOrdering enum
+#include "pme_gpu_types_host.h"
+
+// PME interpolation order
+constexpr int c_pmeOrder = 4;
+// These hardcoded spread/gather parameters refer to not-implemented PME GPU 2D decomposition in X/Y
+constexpr bool c_wrapX = true;
+constexpr bool c_wrapY = true;
+
+static int subGroupSizeFromVendor(const DeviceInformation& deviceInfo)
+{
+    switch (deviceInfo.deviceVendor)
+    {
+        case DeviceVendor::Amd: return 64;   // Handle RDNA2 devices, Issue #3972.
+        case DeviceVendor::Intel: return 16; // TODO: Choose best value, Issue #4153.
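
The vendor table above hard-codes known-good sub-group sizes for the kernels compiled in this file. For reference only, SYCL 2020 also lets one query the sizes a device supports at run time; a hedged sketch of such a query (not what this patch does, since the kernel instantiations below are compiled for fixed sizes):

    #include <CL/sycl.hpp>
    #include <cstdio>

    int main()
    {
        const cl::sycl::device dev{ cl::sycl::default_selector{} };
        // SYCL 2020 device descriptor listing the supported sub-group sizes.
        const auto sizes = dev.get_info<cl::sycl::info::device::sub_group_sizes>();
        for (const size_t s : sizes)
        {
            std::printf("supported sub-group size: %zu\n", s);
        }
        return 0;
    }
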
+        case DeviceVendor::Nvidia: return 32;
+        default: GMX_RELEASE_ASSERT(false, "Unknown device vendor"); return 0;
+    }
+}
+
+#define INSTANTIATE_3(order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) \
+    extern template class PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, true, true, numGrids, writeGlobal, threadsPerAtom, subGroupSize>;
+
+#define INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)                 \
+    INSTANTIATE_3(order, true, true, numGrids, true, threadsPerAtom, subGroupSize);  \
+    INSTANTIATE_3(order, true, false, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_3(order, false, true, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_3(order, true, true, numGrids, false, threadsPerAtom, subGroupSize);
+
+#define INSTANTIATE(order, subGroupSize)                                 \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+
+#if GMX_SYCL_DPCPP
+INSTANTIATE(4, 16);
+#elif GMX_SYCL_HIPSYCL
+INSTANTIATE(4, 32);
+INSTANTIATE(4, 64);
+#endif
+
+
+//! Helper function to set proper kernel functor pointers
+template<int subGroupSize>
+static void setKernelPointers(struct PmeGpuProgramImpl* pmeGpuProgram)
+{
+    /* Not all combinations of the splineAndSpread, spline and spread kernels are required.
+     * If running only the spline kernel (without the spread), it does not make sense not to
+     * write the data to global memory. Similarly, the spread kernel (without the spline)
+     * implies that we should read the spline data from global memory.
+     */
+    pmeGpuProgram->splineAndSpreadKernelSingle =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelThPerAtom4Single =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, false, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelWriteSplinesSingle =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelWriteSplinesThPerAtom4Single =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineKernelSingle =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineKernelThPerAtom4Single =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->spreadKernelSingle =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->spreadKernelThPerAtom4Single =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 1, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelDual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelThPerAtom4Dual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, false, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelWriteSplinesDual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineAndSpreadKernelWriteSplinesThPerAtom4Dual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->splineKernelDual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->splineKernelThPerAtom4Dual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, true, false, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->spreadKernelDual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->spreadKernelThPerAtom4Dual =
+            new PmeSplineAndSpreadKernel<c_pmeOrder, false, true, c_wrapX, c_wrapY, 2, true, ThreadsPerAtom::Order, subGroupSize>();
+}
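
setKernelPointers() is templated on the sub-group size while the device only reports its size at run time, so the constructor below bridges the two with a switch over the supported values. The pattern in isolation:

    #include <cstdio>

    // The runtime value selects among compile-time instantiations - a common
    // bridge pattern between runtime device properties and template parameters.
    template<int subGroupSize>
    static void doWork()
    {
        std::printf("running the %d-wide instantiation\n", subGroupSize);
    }

    static void dispatch(const int runtimeSize)
    {
        switch (runtimeSize)
        {
            case 16: doWork<16>(); break;
            case 32: doWork<32>(); break;
            case 64: doWork<64>(); break;
            default: std::printf("unsupported size %d\n", runtimeSize);
        }
    }

    int main()
    {
        dispatch(32);
        return 0;
    }
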
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
+    deviceContext_(deviceContext)
+{
+    // kernel parameters
+    warpSize_             = subGroupSizeFromVendor(deviceContext.deviceInfo());
+    spreadWorkGroupSize   = c_spreadMaxWarpsPerBlock * warpSize_;
+    solveMaxWorkGroupSize = c_solveMaxWarpsPerBlock * warpSize_;
+    gatherWorkGroupSize   = c_gatherMaxWarpsPerBlock * warpSize_;
+
+    switch (warpSize_)
+    {
+#if GMX_SYCL_DPCPP
+        case 16: setKernelPointers<16>(this); break;
+#elif GMX_SYCL_HIPSYCL
+        case 32: setKernelPointers<32>(this); break;
+        case 64: setKernelPointers<64>(this); break;
+#endif
+        default: GMX_RELEASE_ASSERT(false, "Invalid sub group size");
+    }
+}
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl()
+{
+    delete splineKernelSingle;
+    delete splineKernelThPerAtom4Single;
+    delete spreadKernelSingle;
+    delete spreadKernelThPerAtom4Single;
+    delete splineAndSpreadKernelSingle;
+    delete splineAndSpreadKernelThPerAtom4Single;
+    delete splineAndSpreadKernelWriteSplinesSingle;
+    delete splineAndSpreadKernelWriteSplinesThPerAtom4Single;
+    delete splineKernelDual;
+    delete splineKernelThPerAtom4Dual;
+    delete spreadKernelDual;
+    delete spreadKernelThPerAtom4Dual;
+    delete splineAndSpreadKernelDual;
+    delete splineAndSpreadKernelThPerAtom4Dual;
+    delete splineAndSpreadKernelWriteSplinesDual;
+    delete splineAndSpreadKernelWriteSplinesThPerAtom4Dual;
+}
diff --git a/src/gromacs/ewald/pme_spread_sycl.cpp b/src/gromacs/ewald/pme_spread_sycl.cpp
new file mode 100644
index 0000000000..c88c1de374
--- /dev/null
+++ b/src/gromacs/ewald/pme_spread_sycl.cpp
@@ -0,0 +1,501 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements PME GPU spline calculation and charge spreading in SYCL.
+ *
+ * \author Andrey Alekseenko
+ */
+
+#include "gmxpre.h"
+
+#include "pme_spread_sycl.h"
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits_sycl.h"
+#include "gromacs/gpu_utils/sycl_kernel_utils.h"
+#include "gromacs/gpu_utils/syclutils.h"
+
+#include "pme_gpu_calculate_splines_sycl.h"
+#include "pme_grid.h"
+#include "pme_gpu_types_host.h"
+
+/*! \brief
+ * Charge spreading onto the grid.
+ * This corresponds to the CPU function spread_coefficients_bsplines_thread().
+ * Optional second stage of the spline_and_spread_kernel.
+ *
+ * \tparam order          PME interpolation order.
+ * \tparam wrapX          Whether the grid overlap in dimension X should be wrapped.
+ * \tparam wrapY          Whether the grid overlap in dimension Y should be wrapped.
+ * \tparam threadsPerAtom How many threads work on each atom.
+ * \tparam subGroupSize   Size of the sub-group.
+ *
+ * \param[in]     atomCharge         Atom charge/coefficient of atom processed by thread.
+ * \param[in]     realGridSize       Size of the real grid.
+ * \param[in]     realGridSizePadded Padded size of the real grid.
+ * \param[in,out] gm_grid            Device pointer to the real grid to which charges are added.
+ * \param[in]     sm_gridlineIndices Atom gridline indices in the local memory.
+ * \param[in]     sm_theta           Atom spline values in the local memory.
+ * \param[in]     itemIdx            Current thread ID.
+ */
+template<int order, bool wrapX, bool wrapY, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+inline void spread_charges(const float                      atomCharge,
+                           const int                        realGridSize[DIM],
+                           const int                        realGridSizePadded[DIM],
+                           cl::sycl::global_ptr<float>      gm_grid,
+                           const cl::sycl::local_ptr<int>   sm_gridlineIndices,
+                           const cl::sycl::local_ptr<float> sm_theta,
+                           const cl::sycl::nd_item<3>&      itemIdx)
+{
+    //! Number of atoms processed by a single warp in spread and gather
+    const int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
+    const int atomsPerWarp        = subGroupSize / threadsPerAtomValue;
+
+    const int nx  = realGridSize[XX];
+    const int ny  = realGridSize[YY];
+    const int nz  = realGridSize[ZZ];
+    const int pny = realGridSizePadded[YY];
+    const int pnz = realGridSizePadded[ZZ];
+
+    const int atomIndexLocal = itemIdx.get_local_id(0);
+
+    const bool chargeCheck = pmeGpuCheckAtomCharge(atomCharge);
+
+    if (chargeCheck)
+    {
+        // Spline Z coordinates
+        const int ithz = itemIdx.get_local_id(2);
+
+        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
+        const int iyBase = sm_gridlineIndices[atomIndexLocal * DIM + YY];
+        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+        if (iz >= nz)
+        {
+            iz -= nz;
+        }
+        /* Atom index w.r.t. warp - alternating 0 1 0 1 ... */
+        const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+        /* Warp index w.r.t. block - could probably be obtained easier? */
+        const int warpIndex = atomIndexLocal / atomsPerWarp;
+
+        const int splineIndexBase =
+                getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+        const int   splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
+        const float thetaZ       = sm_theta[splineIndexZ];
+
+        /* loop not used if order*order threads per atom */
+        const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : itemIdx.get_local_id(YY);
+        const int ithyMax =
+                (threadsPerAtom == ThreadsPerAtom::Order) ? order : itemIdx.get_local_id(YY) + 1;
+        for (int ithy = ithyMin; ithy < ithyMax; ithy++)
+        {
+            int iy = iyBase + ithy;
+            if (wrapY & (iy >= ny))
+            {
+                iy -= ny;
+            }
+
+            const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+            const float thetaY = sm_theta[splineIndexY];
+            const float Val    = thetaZ * thetaY * atomCharge;
+            assertIsFinite(Val);
+            const int offset = iy * pnz + iz;
+
+#pragma unroll
+            for (int ithx = 0; (ithx < order); ithx++)
+            {
+                int ix = ixBase + ithx;
+                if (wrapX & (ix >= nx))
+                {
+                    ix -= nx;
+                }
+                const int gridIndexGlobal = ix * pny * pnz + offset;
+                const int splineIndexX =
+                        getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+                const float thetaX = sm_theta[splineIndexX];
+                assertIsFinite(thetaX);
+                assertIsFinite(gm_grid[gridIndexGlobal]);
+                atomicFetchAdd(gm_grid[gridIndexGlobal], thetaX * Val);
+            }
+        }
+    }
+}
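
A note on the conditional wrap used above: a gridline index can exceed the grid size by less than the interpolation order (the grid carries an overlap region of that width), so a single conditional subtraction replaces a general modulo, which is cheaper on the GPU. In isolation:

    #include <cassert>

    // One conditional subtraction wraps an index that is at most (order - 1)
    // cells past the edge.
    inline int wrapGridIndex(const int i, const int n)
    {
        return (i >= n) ? (i - n) : i;
    }

    int main()
    {
        const int nz = 64;
        assert(wrapGridIndex(63, nz) == 63); // interior point, untouched
        assert(wrapGridIndex(64, nz) == 0);  // first cell past the edge wraps to 0
        assert(wrapGridIndex(66, nz) == 2);  // still < nz + order for order = 4
        return 0;
    }
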
+
+
+/*! \brief
+ * A spline computation and charge spreading kernel function.
+ *
+ * Two tuning parameters can be used for additional performance. For small systems and for
+ * debugging, \p writeGlobal should be used, removing the need to recalculate the theta values
+ * in the gather kernel. Similarly, for large systems, using \c order threads per atom
+ * (\p threadsPerAtom == ThreadsPerAtom::Order) gives higher performance than order*order threads.
+ *
+ * \tparam order          PME interpolation order.
+ * \tparam computeSplines A boolean which tells if the spline parameter and gridline indices'
+ *                        computation should be performed.
+ * \tparam spreadCharges  A boolean which tells if the charge spreading should be performed.
+ * \tparam wrapX          A boolean which tells if the grid overlap in dimension X should be wrapped.
+ * \tparam wrapY          A boolean which tells if the grid overlap in dimension Y should be wrapped.
+ * \tparam numGrids       The number of grids to use in the kernel. Can be 1 or 2.
+ * \tparam writeGlobal    A boolean which tells if the theta values and gridlines should be written
+ *                        to global memory.
+ * \tparam threadsPerAtom How many threads work on each atom.
+ * \tparam subGroupSize   Size of the sub-group.
+ */
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+auto pmeSplineAndSpreadKernel(
+        cl::sycl::handler&                                                           cgh,
+        const int                                                                    nAtoms,
+        OptionalAccessor<float, mode::read_write, spreadCharges>                     a_realGrid_0,
+        OptionalAccessor<float, mode::read_write, numGrids == 2 && spreadCharges>    a_realGrid_1,
+        OptionalAccessor<float, mode::read_write, writeGlobal || computeSplines>     a_theta,
+        OptionalAccessor<float, mode::discard_write, computeSplines && writeGlobal>  a_dtheta,
+        OptionalAccessor<int, mode::read_write, writeGlobal>                         a_gridlineIndices,
+        OptionalAccessor<float, mode::read, computeSplines>                          a_fractShiftsTable,
+        OptionalAccessor<int, mode::read, computeSplines>                            a_gridlineIndicesTable,
+        DeviceAccessor<float, mode::read>                                            a_coefficients_0,
+        OptionalAccessor<float, mode::read, numGrids == 2 && spreadCharges>          a_coefficients_1,
+        OptionalAccessor<Float3, mode::read, computeSplines>                         a_coordinates,
+        const gmx::IVec tablesOffsets,
+        const gmx::IVec realGridSize,
+        const gmx::RVec realGridSizeFP,
+        const gmx::IVec realGridSizePadded,
+        const gmx::RVec currentRecipBox0,
+        const gmx::RVec currentRecipBox1,
+        const gmx::RVec currentRecipBox2)
+{
+    constexpr int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
+    constexpr int spreadMaxThreadsPerBlock = c_spreadMaxWarpsPerBlock * subGroupSize;
+    constexpr int atomsPerBlock            = spreadMaxThreadsPerBlock / threadsPerAtomValue;
+    // Number of atoms processed by a single warp in spread and gather
+    static_assert(subGroupSize >= threadsPerAtomValue);
+    constexpr int atomsPerWarp = subGroupSize / threadsPerAtomValue;
+
+    if constexpr (spreadCharges)
+    {
+        cgh.require(a_realGrid_0);
+    }
+    if constexpr (writeGlobal || computeSplines)
+    {
+        cgh.require(a_theta);
+    }
+    if constexpr (computeSplines && writeGlobal)
+    {
+        cgh.require(a_dtheta);
+    }
+    if constexpr (writeGlobal)
+    {
+        cgh.require(a_gridlineIndices);
+    }
+    if constexpr (computeSplines)
+    {
+        cgh.require(a_fractShiftsTable);
+        cgh.require(a_gridlineIndicesTable);
+        cgh.require(a_coordinates);
+    }
+    cgh.require(a_coefficients_0);
+    if constexpr (numGrids == 2 && spreadCharges)
+    {
+        cgh.require(a_realGrid_1);
+        cgh.require(a_coefficients_1);
+    }
+
+    // Gridline indices, ivec
+    cl::sycl::accessor<int, 1, mode::read_write, target::local> sm_gridlineIndices(
+            cl::sycl::range<1>(atomsPerBlock * DIM), cgh);
+    // Charges
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_coefficients(
+            cl::sycl::range<1>(atomsPerBlock), cgh);
+    // Spline values
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_theta(
+            cl::sycl::range<1>(atomsPerBlock * DIM * order), cgh);
+    auto sm_fractCoords = [&]() {
+        if constexpr (computeSplines)
+        {
+            return cl::sycl::accessor<float, 1, mode::read_write, target::local>(
+                    cl::sycl::range<1>(atomsPerBlock * DIM), cgh);
+        }
+        else
+        {
+            return nullptr;
+        }
+    }();
+
+    return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
+    {
+        const int blockIndex      = itemIdx.get_group_linear_id();
+        const int atomIndexOffset = blockIndex * atomsPerBlock;
+
+        /* Thread index w.r.t. block */
+        const int threadLocalId = itemIdx.get_local_linear_id();
+        /* Warp index w.r.t. block - could probably be obtained easier? */
+        const int warpIndex = threadLocalId / subGroupSize;
+
+        /* Atom index w.r.t. warp */
+        const int atomWarpIndex = itemIdx.get_local_id(XX) % atomsPerWarp;
+        /* Atom index w.r.t. block/shared memory */
+        const int atomIndexLocal = warpIndex * atomsPerWarp + atomWarpIndex;
+        /* Atom index w.r.t. global memory */
+        const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+
+        /* Early return for fully empty blocks at the end
+         * (should only happen for billions of input atoms) */
+        if (atomIndexOffset >= nAtoms)
+        {
+            return;
+        }
+
+        /* Charges, required for both spline and spread */
+        pmeGpuStageAtomData<float, atomsPerBlock, 1>(
+                sm_coefficients.get_pointer(), a_coefficients_0.get_pointer(), itemIdx);
+        itemIdx.barrier(fence_space::local_space);
+        const float atomCharge = sm_coefficients[atomIndexLocal];
+
+        if constexpr (computeSplines)
+        {
+            // SYCL-TODO: Use prefetching? Issue #4153.
+            const Float3 atomX = a_coordinates[atomIndexGlobal];
+            // Lambdas below can be avoided when hipSYCL merges https://github.com/illuhad/hipSYCL/pull/629.
+            cl::sycl::global_ptr<float> gm_dtheta = [&]() {
+                if constexpr (writeGlobal)
+                {
+                    return a_dtheta.get_pointer();
+                }
+                else
+                {
+                    return nullptr;
+                }
+            }();
+            cl::sycl::global_ptr<int> gm_gridlineIndices = [&]() {
+                if constexpr (writeGlobal)
+                {
+                    return a_gridlineIndices.get_pointer();
+                }
+                else
+                {
+                    return nullptr;
+                }
+            }();
+            calculateSplines<order, atomsPerBlock, atomsPerWarp, false, writeGlobal, numGrids, subGroupSize>(
+                    atomIndexOffset,
+                    atomX,
+                    atomCharge,
+                    tablesOffsets,
+                    realGridSizeFP,
+                    currentRecipBox0,
+                    currentRecipBox1,
+                    currentRecipBox2,
+                    a_theta.get_pointer(),
+                    gm_dtheta,
+                    gm_gridlineIndices,
+                    a_fractShiftsTable.get_pointer(),
+                    a_gridlineIndicesTable.get_pointer(),
+                    sm_theta.get_pointer(),
+                    nullptr,
+                    sm_gridlineIndices.get_pointer(),
+                    sm_fractCoords.get_pointer(),
+                    itemIdx);
+            subGroupBarrier(itemIdx);
+        }
+        else
+        {
+            /* Staging the data for spread
+             * (the data is assumed to be in GPU global memory with proper layout already,
+             * as in after running the spline kernel)
+             */
+            /* Spline data - only thetas (dthetas will only be needed in gather) */
+            pmeGpuStageAtomData<float, atomsPerBlock, DIM * order>(
+                    sm_theta.get_pointer(), a_theta.get_pointer(), itemIdx);
+            /* Gridline indices */
+            pmeGpuStageAtomData<int, atomsPerBlock, DIM>(
+                    sm_gridlineIndices.get_pointer(), a_gridlineIndices.get_pointer(), itemIdx);
+
+            itemIdx.barrier(fence_space::local_space);
+        }
+
+        /* Spreading */
+        if constexpr (spreadCharges)
+        {
+            spread_charges<order, wrapX, wrapY, threadsPerAtom, subGroupSize>(
+                    atomCharge,
+                    realGridSize,
+                    realGridSizePadded,
+                    a_realGrid_0.get_pointer(),
+                    sm_gridlineIndices.get_pointer(),
+                    sm_theta.get_pointer(),
+                    itemIdx);
+        }
+        if constexpr (numGrids == 2 && spreadCharges)
+        {
+            itemIdx.barrier(fence_space::local_space);
+            pmeGpuStageAtomData<float, atomsPerBlock, 1>(
+                    sm_coefficients.get_pointer(), a_coefficients_1.get_pointer(), itemIdx);
+            itemIdx.barrier(fence_space::local_space);
+            const float atomCharge = sm_coefficients[atomIndexLocal];
+
+            spread_charges<order, wrapX, wrapY, threadsPerAtom, subGroupSize>(
+                    atomCharge,
+                    realGridSize,
+                    realGridSizePadded,
+                    a_realGrid_1.get_pointer(),
+                    sm_gridlineIndices.get_pointer(),
+                    sm_theta.get_pointer(),
+                    itemIdx);
+        }
+    };
+}
+
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>::PmeSplineAndSpreadKernel()
+{
+    reset();
+}
+
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+void PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>::setArg(
+        size_t argIndex, void* arg)
+{
+    if (argIndex == 0)
+    {
+        auto* params   = reinterpret_cast<PmeGpuKernelParams*>(arg);
+        gridParams_    = &params->grid;
+        atomParams_    = &params->atoms;
+        dynamicParams_ = &params->current;
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(argIndex == 0, "Trying to pass too many args to the kernel");
+    }
+}
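
For orientation, setArg()/launch() implement the OpenCL-style argument protocol that the PmeKernelHandle table expects of every kernel functor. A hedged sketch of a call site, assuming the PmeGpuKernelParams type used in the cast above (actual launches go through the GROMACS kernel-launch helpers):

    // Hypothetical call site: the whole parameter pack travels as argument 0,
    // and launch() consumes the stored pointers and then resets them.
    void launchSpread(PmeGpuProgramImpl*        program,
                      PmeGpuKernelParams*       kernelParams,
                      const KernelLaunchConfig& config,
                      const DeviceStream&       stream)
    {
        ISyclKernelFunctor* kernel = program->splineAndSpreadKernelSingle;
        kernel->setArg(0, kernelParams);
        cl::sycl::event e = kernel->launch(config, stream); // args are reset after this
        e.wait_and_throw(); // synchronous here only for illustration
    }
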
+
+
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+cl::sycl::event
+PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>::launch(
+        const KernelLaunchConfig& config, const DeviceStream& deviceStream)
+{
+    GMX_RELEASE_ASSERT(gridParams_, "Can not launch the kernel before setting its args");
+    GMX_RELEASE_ASSERT(atomParams_, "Can not launch the kernel before setting its args");
+    GMX_RELEASE_ASSERT(dynamicParams_, "Can not launch the kernel before setting its args");
+
+    using kernelNameType =
+            PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>;
+
+    // SYCL has different multidimensional layout than OpenCL/CUDA.
+    const cl::sycl::range<3> localSize{ config.blockSize[2], config.blockSize[1], config.blockSize[0] };
+    const cl::sycl::range<3> groupRange{ config.gridSize[2], config.gridSize[1], config.gridSize[0] };
+    const cl::sycl::nd_range<3> range{ groupRange * localSize, localSize };
+
+    cl::sycl::queue q = deviceStream.stream();
+
+    cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
+        auto kernel =
+                pmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>(
+                        cgh,
+                        atomParams_->nAtoms,
+                        gridParams_->d_realGrid[0],
+                        gridParams_->d_realGrid[1],
+                        atomParams_->d_theta,
+                        atomParams_->d_dtheta,
+                        atomParams_->d_gridlineIndices,
+                        gridParams_->d_fractShiftsTable,
+                        gridParams_->d_gridlineIndicesTable,
+                        atomParams_->d_coefficients[0],
+                        atomParams_->d_coefficients[1],
+                        atomParams_->d_coordinates,
+                        gridParams_->tablesOffsets,
+                        gridParams_->realGridSize,
+                        gridParams_->realGridSizeFP,
+                        gridParams_->realGridSizePadded,
+                        dynamicParams_->recipBox[0],
+                        dynamicParams_->recipBox[1],
+                        dynamicParams_->recipBox[2]);
+        cgh.parallel_for<kernelNameType>(range, kernel);
+    });
+
+    // Delete set args, so we don't forget to set them before the next launch.
+    reset();
+
+    return e;
+}
+
+
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+void PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, wrapX, wrapY, numGrids, writeGlobal, threadsPerAtom, subGroupSize>::reset()
+{
+    gridParams_    = nullptr;
+    atomParams_    = nullptr;
+    dynamicParams_ = nullptr;
+}
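
The index reversal in launch() reflects that KernelLaunchConfig keeps the CUDA/OpenCL convention, where component 0 varies fastest, while in a SYCL range the last component varies fastest. A worked example of the mapping, with illustrative values only:

    #include <CL/sycl.hpp>

    // A CUDA-style block of (x=16, y=4, z=1) threads becomes a SYCL local range
    // of {1, 4, 16}, keeping x as the fastest-varying dimension.
    void exampleRangeMapping()
    {
        const size_t blockSize[3] = { 16, 4, 1 };  // CUDA-style: x, y, z
        const size_t gridSize[3]  = { 128, 1, 1 }; // number of blocks per dimension

        const cl::sycl::range<3>    localSize{ blockSize[2], blockSize[1], blockSize[0] };
        const cl::sycl::range<3>    groupRange{ gridSize[2], gridSize[1], gridSize[0] };
        const cl::sycl::nd_range<3> range{ groupRange * localSize, localSize };
        (void)range; // would be passed to cgh.parallel_for(...)
    }
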
+
+//! Kernel instantiations
+/* Disable the "explicit template instantiation 'PmeSplineAndSpreadKernel<...>' will emit a vtable in every
+ * translation unit [-Wweak-template-vtables]" warning.
+ * It is only explicitly instantiated in this translation unit, so we should be safe.
+ */
+#ifdef __clang__
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-template-vtables"
+#endif
+
+#define INSTANTIATE_3(order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) \
+    template class PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, true, true, numGrids, writeGlobal, threadsPerAtom, subGroupSize>;
+
+#define INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)                 \
+    INSTANTIATE_3(order, true, true, numGrids, true, threadsPerAtom, subGroupSize);  \
+    INSTANTIATE_3(order, true, false, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_3(order, false, true, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_3(order, true, true, numGrids, false, threadsPerAtom, subGroupSize);
+
+#define INSTANTIATE(order, subGroupSize)                                 \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+
+#if GMX_SYCL_DPCPP
+INSTANTIATE(4, 16); // TODO: Choose best value, Issue #4153.
+#elif GMX_SYCL_HIPSYCL
+INSTANTIATE(4, 32);
+INSTANTIATE(4, 64);
+#endif
+
+#ifdef __clang__
+#    pragma clang diagnostic pop
+#endif
diff --git a/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp b/src/gromacs/ewald/pme_spread_sycl.h
similarity index 60%
rename from src/gromacs/ewald/pme_gpu_sycl_stubs.cpp
rename to src/gromacs/ewald/pme_spread_sycl.h
index 832395e084..fdf65fb8e4 100644
--- a/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp
+++ b/src/gromacs/ewald/pme_spread_sycl.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Copyright (c) 2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -34,26 +34,34 @@
  */
 
 /*! \internal \file
- * \brief Implements stubs of high-level PME GPU functions for SYCL.
+ * \brief Implements PME GPU spline calculation and charge spreading in SYCL.
+ * TODO: consider always pre-sorting particles (as in DD case).
  *
- * \author Andrey Alekseenko
- *
- * \ingroup module_ewald
+ * \author Andrey Alekseenko
  */
-#include "gmxpre.h"
 
-#include "gromacs/ewald/ewald_utils.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+
+#include "gromacs/gpu_utils/syclutils.h"
+
+#include "pme_grid.h"
+#include "pme_gpu_types_host.h"
 
-#include "pme_gpu_program_impl.h"
+struct PmeGpuGridParams;
+struct PmeGpuAtomParams;
+struct PmeGpuDynamicParams;
 
-PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
-    deviceContext_(deviceContext),
-    warpSize_(0),
-    spreadWorkGroupSize(0),
-    gatherWorkGroupSize(0),
-    solveMaxWorkGroupSize(0)
+template<int order, bool computeSplines, bool spreadCharges, bool wrapX, bool wrapY, int numGrids, bool writeGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+class PmeSplineAndSpreadKernel : public ISyclKernelFunctor
 {
-    // SYCL-TODO
-}
+public:
+    PmeSplineAndSpreadKernel();
+    void setArg(size_t argIndex, void* arg) override;
+    cl::sycl::event launch(const KernelLaunchConfig& config, const DeviceStream& deviceStream) override;
 
-PmeGpuProgramImpl::~PmeGpuProgramImpl() = default;
+private:
+    PmeGpuGridParams*    gridParams_;
+    PmeGpuAtomParams*    atomParams_;
+    PmeGpuDynamicParams* dynamicParams_;
+    void reset();
+};
diff --git a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
index 98c70599f2..d56cc256b8 100644
--- a/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
+++ b/src/gromacs/ewald/tests/pmesplinespreadtest.cpp
@@ -85,7 +85,17 @@ public:
     PmeSplineAndSpreadTest() = default;
 
     //! Sets the programs once
-    static void SetUpTestSuite() { s_pmeTestHardwareContexts = createPmeTestHardwareContextList(); }
+    static void SetUpTestSuite()
+    {
+        s_pmeTestHardwareContexts    = createPmeTestHardwareContextList();
+        g_allowPmeWithSyclForTesting = true; // We support PmeSplineAndSpread with SYCL
+    }
+
+    static void TearDownTestSuite()
+    {
+        // Revert the value back.
+        g_allowPmeWithSyclForTesting = false;
+    }
 
     //! The test
     static void runTest()