From f466950153163ea6d36fa01b315fa52045038a72 Mon Sep 17 00:00:00 2001
From: Andrey Alekseenko
Date: Sat, 25 Sep 2021 09:55:00 +0000
Subject: [PATCH] SYCL PME Gather kernel

---
 src/gromacs/ewald/CMakeLists.txt          |   3 +-
 src/gromacs/ewald/pme_gather.cu           |   2 -
 src/gromacs/ewald/pme_gather_sycl.cpp     | 721 ++++++++++++++++++
 src/gromacs/ewald/pme_gather_sycl.h       |  66 ++
 .../ewald/pme_gpu_program_impl_sycl.cpp   |  51 +-
 src/gromacs/ewald/tests/pmegathertest.cpp |   9 +-
 6 files changed, 837 insertions(+), 15 deletions(-)
 create mode 100644 src/gromacs/ewald/pme_gather_sycl.cpp
 create mode 100644 src/gromacs/ewald/pme_gather_sycl.h

diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt
index 27dc116a28..78d2a4ed02 100644
--- a/src/gromacs/ewald/CMakeLists.txt
+++ b/src/gromacs/ewald/CMakeLists.txt
@@ -86,9 +86,9 @@ elseif (GMX_GPU_OPENCL)
         pme_gpu_timings.cpp
         )
 elseif (GMX_GPU_SYCL)
-    # SYCL-TODO: proper implementation
     gmx_add_libgromacs_sources(
         # GPU-specific sources
+        pme_gather_sycl.cpp
         pme_gpu.cpp
         pme_gpu_internal.cpp
         pme_gpu_program_impl_sycl.cpp
@@ -96,6 +96,7 @@ elseif (GMX_GPU_SYCL)
         pme_gpu_timings.cpp
         )
     _gmx_add_files_to_property(SYCL_SOURCES
+        pme_gather_sycl.cpp
         pme_gpu_internal.cpp
         pme_gpu_program.cpp
         pme_gpu_program_impl_sycl.cpp
diff --git a/src/gromacs/ewald/pme_gather.cu b/src/gromacs/ewald/pme_gather.cu
index 28de3817e5..3964b3e7c9 100644
--- a/src/gromacs/ewald/pme_gather.cu
+++ b/src/gromacs/ewald/pme_gather.cu
@@ -310,8 +310,6 @@ __device__ __forceinline__ void sumForceComponents(float* __restrict__ fx,
  * \param[in]  gm_coefficients  Global memory array of the coefficients to use for an unperturbed
  *                              or FEP in state A if a single grid is used (\p multiCoefficientsSingleGrid == true).If two
  *                              separate grids are used this should be the coefficients of the grid in question.
- * \param[in]  gm_coefficientsB Global memory array of the coefficients to use for FEP in state B.
- *                              Should be nullptr if two separate grids are used.
  */
 __device__ __forceinline__ void calculateAndStoreGridForces(float3* __restrict__ sm_forces,
                                                             const int forceIndexLocal,
diff --git a/src/gromacs/ewald/pme_gather_sycl.cpp b/src/gromacs/ewald/pme_gather_sycl.cpp
new file mode 100644
index 0000000000..b03a7804b3
--- /dev/null
+++ b/src/gromacs/ewald/pme_gather_sycl.cpp
@@ -0,0 +1,721 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements PME force gathering in SYCL.
+ *
+ * \author Andrey Alekseenko
+ */
+
+#include "gmxpre.h"
+
+#include "pme_gather_sycl.h"
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits_sycl.h"
+#include "gromacs/gpu_utils/sycl_kernel_utils.h"
+#include "gromacs/gpu_utils/syclutils.h"
+#include "gromacs/math/functions.h"
+
+#include "pme_gpu_calculate_splines_sycl.h"
+#include "pme_grid.h"
+#include "pme_gpu_constants.h"
+#include "pme_gpu_types_host.h"
+
+
+/*! \brief Reduce the partial force contributions.
+ *
+ * \tparam order         The PME order (must be 4).
+ * \tparam atomDataSize  The number of partial force contributions for each atom (currently
+ *                       order^2 == 16).
+ * \tparam workGroupSize The size of a work-group.
+ * \tparam subGroupSize  The size of a sub-group.
+ *
+ * \param[in]  itemIdx        SYCL thread ID.
+ * \param[out] sm_forces      Shared memory array with the output forces (number of elements
+ *                            is number of atoms per block).
+ * \param[in]  atomIndexLocal Local atom index.
+ * \param[in]  splineIndex    Spline index.
+ * \param[in]  lineIndex      Line index (same as threadLocalId).
+ * \param[in]  realGridSizeFP Local grid size constant.
+ * \param[in]  fx             Input force partial component X.
+ * \param[in]  fy             Input force partial component Y.
+ * \param[in]  fz             Input force partial component Z.
+ */
+template<int order, int atomDataSize, int workGroupSize, int subGroupSize>
+inline void reduceAtomForces(cl::sycl::nd_item<3>        itemIdx,
+                             cl::sycl::local_ptr<Float3> sm_forces,
+                             const int                   atomIndexLocal,
+                             const int                   splineIndex,
+                             const int gmx_unused        lineIndex,
+                             const float                 realGridSizeFP[3],
+                             float&                      fx, // NOLINT(google-runtime-references)
+                             float&                      fy, // NOLINT(google-runtime-references)
+                             float&                      fz) // NOLINT(google-runtime-references)
+{
+    static_assert(gmx::isPowerOfTwo(order));
+    // TODO: find out if this is the best in terms of transactions count
+    static_assert(order == 4, "Only order of 4 is implemented");
+
+    sycl_2020::sub_group sg = itemIdx.get_sub_group();
+
+    static_assert(atomDataSize <= subGroupSize,
+                  "TODO: rework for atomDataSize > subGroupSize (order 8 or larger)");
+
+    fx += sycl_2020::shift_left(sg, fx, 1);
+    fy += sycl_2020::shift_right(sg, fy, 1);
+    fz += sycl_2020::shift_left(sg, fz, 1);
+    if (splineIndex & 1)
+    {
+        fx = fy;
+    }
+    fx += sycl_2020::shift_left(sg, fx, 2);
+    fz += sycl_2020::shift_right(sg, fz, 2);
+    if (splineIndex & 2)
+    {
+        fx = fz;
+    }
+    // We just have to further reduce those groups of 4
+    for (int delta = 4; delta < atomDataSize; delta *= 2)
+    {
+        fx += sycl_2020::shift_left(sg, fx, delta);
+    }
+    const int dimIndex = splineIndex;
+    if (dimIndex < DIM)
+    {
+        const float n = realGridSizeFP[dimIndex];
+        sm_forces[atomIndexLocal][dimIndex] = fx * n;
+    }
+}
+
+/*! \brief Calculate the sum of the force partial components (in X, Y and Z).
+ *
+ * \tparam order        The PME order (must be 4).
+ * \tparam atomsPerWarp The number of atoms per GPU warp.
+ * \tparam wrapX        Tells if the grid is wrapped in the X dimension.
+ * \tparam wrapY        Tells if the grid is wrapped in the Y dimension.
+ * \param[out] fx                 The force partial component in the X dimension.
+ * \param[out] fy                 The force partial component in the Y dimension.
+ * \param[out] fz                 The force partial component in the Z dimension.
+ * \param[in]  ithyMin            The thread minimum index in the Y dimension.
+ * \param[in]  ithyMax            The thread maximum index in the Y dimension.
+ * \param[in]  ixBase             The grid line index base value in the X dimension.
+ * \param[in]  iz                 The grid line index in the Z dimension.
+ * \param[in]  nx                 The grid real size in the X dimension.
+ * \param[in]  ny                 The grid real size in the Y dimension.
+ * \param[in]  pny                The padded grid real size in the Y dimension.
+ * \param[in]  pnz                The padded grid real size in the Z dimension.
+ * \param[in]  atomIndexLocal     The atom index for this thread.
+ * \param[in]  splineIndexBase    The base value of the spline parameter index.
+ * \param[in]  tdz                The theta and dtheta in the Z dimension.
+ * \param[in]  sm_gridlineIndices Shared memory array of grid line indices.
+ * \param[in]  sm_theta           Shared memory array of atom theta values.
+ * \param[in]  sm_dtheta          Shared memory array of atom dtheta values.
+ * \param[in]  gm_grid            Global memory array of the grid to use.
+ */
+template<int order, int atomsPerWarp, bool wrapX, bool wrapY>
+inline void sumForceComponents(cl::sycl::private_ptr<float> fx,
+                               cl::sycl::private_ptr<float> fy,
+                               cl::sycl::private_ptr<float> fz,
+                               const int                    ithyMin,
+                               const int                    ithyMax,
+                               const int                    ixBase,
+                               const int                    iz,
+                               const int                    nx,
+                               const int                    ny,
+                               const int                    pny,
+                               const int                    pnz,
+                               const int                    atomIndexLocal,
+                               const int                    splineIndexBase,
+                               const cl::sycl::float2       tdz,
+                               const cl::sycl::local_ptr<const int>    sm_gridlineIndices,
+                               const cl::sycl::local_ptr<const float>  sm_theta,
+                               const cl::sycl::local_ptr<const float>  sm_dtheta,
+                               const cl::sycl::global_ptr<const float> gm_grid)
+{
+    for (int ithy = ithyMin; ithy < ithyMax; ithy++)
+    {
+        const int splineIndexY = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, YY, ithy);
+        const cl::sycl::float2 tdy{ sm_theta[splineIndexY], sm_dtheta[splineIndexY] };
+
+        int iy = sm_gridlineIndices[atomIndexLocal * DIM + YY] + ithy;
+        if (wrapY & (iy >= ny))
+        {
+            iy -= ny;
+        }
+        const int constOffset = iy * pnz + iz;
+
+#pragma unroll
+        for (int ithx = 0; ithx < order; ithx++)
+        {
+            int ix = ixBase + ithx;
+            if (wrapX & (ix >= nx))
+            {
+                ix -= nx;
+            }
+            const int gridIndexGlobal = ix * pny * pnz + constOffset;
+            assert(gridIndexGlobal >= 0);
+            const float gridValue = gm_grid[gridIndexGlobal];
+            assertIsFinite(gridValue);
+            const int splineIndexX = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, XX, ithx);
+            const cl::sycl::float2 tdx{ sm_theta[splineIndexX], sm_dtheta[splineIndexX] };
+            const float fxy1 = tdz[XX] * gridValue;
+            const float fz1  = tdz[YY] * gridValue;
+            *fx += tdx[YY] * tdy[XX] * fxy1;
+            *fy += tdx[XX] * tdy[YY] * fxy1;
+            *fz += tdx[XX] * tdy[XX] * fz1;
+        }
+    }
+}
+
+
+/*! \brief Calculate the grid forces and store them in shared memory.
+ *
+ * \param[in,out] sm_forces       Shared memory array with the output forces.
+ * \param[in] forceIndexLocal     The local (per thread) index in the sm_forces array.
+ * \param[in] forceIndexGlobal    The index of the thread in the gm_coefficients array.
+ * \param[in] recipBox0           The reciprocal box (first vector).
+ * \param[in] recipBox1           The reciprocal box (second vector).
+ * \param[in] recipBox2           The reciprocal box (third vector).
+ * \param[in] scale               The scale to use when calculating the forces. For gm_coefficientsB
+ *                                (when using multiple coefficients on a single grid) the scale will be (1.0 - scale).
+ * \param[in] gm_coefficients     Global memory array of the coefficients to use for an unperturbed
+ *                                or FEP in state A if a single grid is used
+ *                                (\p multiCoefficientsSingleGrid == true). If two separate grids
+ *                                are used, this should be the coefficients of the grid in question.
+ */
+inline void calculateAndStoreGridForces(cl::sycl::local_ptr<Float3>             sm_forces,
+                                        const int                               forceIndexLocal,
+                                        const int                               forceIndexGlobal,
+                                        const Float3&                           recipBox0,
+                                        const Float3&                           recipBox1,
+                                        const Float3&                           recipBox2,
+                                        const float                             scale,
+                                        const cl::sycl::global_ptr<const float> gm_coefficients)
+{
+    const Float3 atomForces     = sm_forces[forceIndexLocal];
+    float        negCoefficient = -scale * gm_coefficients[forceIndexGlobal];
+    Float3       result;
+    result[XX] = negCoefficient * recipBox0[XX] * atomForces[XX];
+    result[YY] = negCoefficient * (recipBox0[YY] * atomForces[XX] + recipBox1[YY] * atomForces[YY]);
+    result[ZZ] = negCoefficient
+                 * (recipBox0[ZZ] * atomForces[XX] + recipBox1[ZZ] * atomForces[YY]
+                    + recipBox2[ZZ] * atomForces[ZZ]);
+    sm_forces[forceIndexLocal] = result;
+}
+
+/*! \brief
+ * A SYCL kernel which gathers the atom forces from the grid.
+ * The grid is assumed to be wrapped in dimension Z.
+ *
+ * \tparam order          PME interpolation order.
+ * \tparam wrapX          A boolean which tells if the grid overlap in dimension X should
+ *                        be wrapped.
+ * \tparam wrapY          A boolean which tells if the grid overlap in dimension Y should
+ *                        be wrapped.
+ * \tparam numGrids       The number of grids to use in the kernel. Can be 1 or 2.
+ * \tparam readGlobal     Tells if we should read spline values from global memory.
+ * \tparam threadsPerAtom How many threads work on each atom.
+ * \tparam subGroupSize   Size of the sub-group.
+ */
+template<int order, bool wrapX, bool wrapY, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+auto pmeGatherKernel(cl::sycl::handler&                                 cgh,
+                     const int                                          nAtoms,
+                     DeviceAccessor<float, mode::read>                  a_gridA,
+                     OptionalAccessor<float, mode::read, numGrids == 2> a_gridB,
+                     DeviceAccessor<float, mode::read>                  a_coefficientsA,
+                     OptionalAccessor<float, mode::read, numGrids == 2> a_coefficientsB,
+                     OptionalAccessor<Float3, mode::read, !readGlobal>  a_coordinates,
+                     DeviceAccessor<Float3, mode::read_write>           a_forces,
+                     DeviceAccessor<float, mode::read>                  a_theta,
+                     DeviceAccessor<float, mode::read>                  a_dtheta,
+                     DeviceAccessor<int, mode::read>                    a_gridlineIndices,
+                     OptionalAccessor<float, mode::read, !readGlobal>   a_fractShiftsTable,
+                     OptionalAccessor<int, mode::read, !readGlobal>     a_gridlineIndicesTable,
+                     const gmx::IVec                                    tablesOffsets,
+                     const gmx::IVec                                    realGridSize,
+                     const gmx::RVec                                    realGridSizeFP,
+                     const gmx::IVec                                    realGridSizePadded,
+                     const gmx::RVec                                    currentRecipBox0,
+                     const gmx::RVec                                    currentRecipBox1,
+                     const gmx::RVec                                    currentRecipBox2,
+                     const float                                        scale)
+{
+    static_assert(numGrids == 1 || numGrids == 2);
+
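+    // Work partitioning for the constants below: e.g., with order == 4 and subGroupSize == 16
+    // (the DPC++ instantiation at the end of this file), ThreadsPerAtom::OrderSquared gives
+    // atomDataSize == 16, i.e. a whole sub-group per atom, while ThreadsPerAtom::Order gives
+    // atomDataSize == 4, i.e. four atoms sharing one sub-group (atomsPerWarp == 4).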
+    constexpr int threadsPerAtomValue = (threadsPerAtom == ThreadsPerAtom::Order) ? order : order * order;
+    constexpr int atomDataSize  = threadsPerAtomValue;
+    constexpr int atomsPerBlock = (c_gatherMaxWarpsPerBlock * subGroupSize) / atomDataSize;
+    // Number of atoms processed by a single warp in spread and gather
+    static_assert(subGroupSize >= atomDataSize);
+    constexpr int atomsPerWarp        = subGroupSize / atomDataSize;
+    constexpr int blockSize           = atomsPerBlock * atomDataSize;
+    constexpr int splineParamsSize    = atomsPerBlock * DIM * order;
+    constexpr int gridlineIndicesSize = atomsPerBlock * DIM;
+
+    cgh.require(a_gridA);
+    cgh.require(a_coefficientsA);
+    cgh.require(a_forces);
+
+    if constexpr (numGrids == 2)
+    {
+        cgh.require(a_gridB);
+        cgh.require(a_coefficientsB);
+    }
+
+    if constexpr (readGlobal)
+    {
+        cgh.require(a_theta);
+        cgh.require(a_dtheta);
+        cgh.require(a_gridlineIndices);
+    }
+    else
+    {
+        cgh.require(a_coordinates);
+        cgh.require(a_fractShiftsTable);
+        cgh.require(a_gridlineIndicesTable);
+    }
+
+    // Gridline indices, ivec
+    cl::sycl::accessor<int, 1, mode::read_write, target::local> sm_gridlineIndices(
+            cl::sycl::range<1>(atomsPerBlock * DIM), cgh);
+    // Spline values
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_theta(
+            cl::sycl::range<1>(atomsPerBlock * DIM * order), cgh);
+    // Spline derivatives
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_dtheta(
+            cl::sycl::range<1>(atomsPerBlock * DIM * order), cgh);
+    // Coefficients prefetch cache
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_coefficients(
+            cl::sycl::range<1>(atomsPerBlock), cgh);
+    // Coordinates prefetch cache
+    cl::sycl::accessor<Float3, 1, mode::read_write, target::local> sm_coordinates(
+            cl::sycl::range<1>(atomsPerBlock), cgh);
+    // Reduction of partial force contributions
+    cl::sycl::accessor<Float3, 1, mode::read_write, target::local> sm_forces(
+            cl::sycl::range<1>(atomsPerBlock), cgh);
+
+    auto sm_fractCoords = [&]() {
+        if constexpr (!readGlobal)
+        {
+            return cl::sycl::accessor<float, 1, mode::read_write, target::local>(
+                    cl::sycl::range<1>(atomsPerBlock * DIM), cgh);
+        }
+        else
+        {
+            return nullptr;
+        }
+    }();
+
+    return [=](cl::sycl::nd_item<3> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
+    {
+        assert(blockSize == itemIdx.get_local_range().size());
+        /* These are the atom indices - for the shared and global memory */
+        const int atomIndexLocal = itemIdx.get_local_id(XX);
+        const int blockIndex =
+                itemIdx.get_group(YY) * itemIdx.get_group_range(ZZ) + itemIdx.get_group(ZZ);
+        // itemIdx.get_group_linear_id();
+
+        const int atomIndexOffset = blockIndex * atomsPerBlock;
+        const int atomIndexGlobal = atomIndexOffset + atomIndexLocal;
+        /* Early return for fully empty blocks at the end
+         * (should only happen for billions of input atoms)
+         */
+        if (atomIndexOffset >= nAtoms)
+        {
+            return;
+        }
+        /* Spline Z coordinates */
+        const int ithz = itemIdx.get_local_id(ZZ);
+        /* These are the spline contribution indices in shared memory */
+        const int splineIndex =
+                itemIdx.get_local_id(YY) * itemIdx.get_local_range(ZZ) + itemIdx.get_local_id(ZZ);
+
+        const int threadLocalId    = itemIdx.get_local_linear_id();
+        const int threadLocalIdMax = blockSize;
+        assert(threadLocalId < threadLocalIdMax);
+
+        const int lineIndex =
+                (itemIdx.get_local_id(XX) * (itemIdx.get_local_range(ZZ) * itemIdx.get_local_range(YY)))
+                + splineIndex; // And to all the block's particles
+        assert(lineIndex == threadLocalId);
+
+        if constexpr (readGlobal)
+        {
+            /* Read splines */
+            const int localGridlineIndicesIndex = threadLocalId;
+            const int globalGridlineIndicesIndex =
+                    blockIndex * gridlineIndicesSize + localGridlineIndicesIndex;
+            // itemIdx.get_group(ZZ) * gridlineIndicesSize + localGridlineIndicesIndex;
+            if (localGridlineIndicesIndex < gridlineIndicesSize)
+            {
+                sm_gridlineIndices[localGridlineIndicesIndex] =
+                        a_gridlineIndices[globalGridlineIndicesIndex];
+                assert(sm_gridlineIndices[localGridlineIndicesIndex] >= 0);
+            }
+            /* With order threads per atom, this loop is needed to make sure each thread loads
+             * all the spline values it is responsible for; with order*order threads per atom,
+             * each thread only needs to load a single value. */
+
+            const int iMin = 0;
+            const int iMax = (threadsPerAtom == ThreadsPerAtom::Order) ? 3 : 1;
+
+            for (int i = iMin; i < iMax; i++)
+            {
+                // i will always be zero for order*order threads per atom
+                const int localSplineParamsIndex = threadLocalId + i * threadLocalIdMax;
+                const int globalSplineParamsIndex = blockIndex * splineParamsSize + localSplineParamsIndex;
+                // const int globalSplineParamsIndex = itemIdx.get_group(ZZ) * splineParamsSize + localSplineParamsIndex;
+                if (localSplineParamsIndex < splineParamsSize)
+                {
+                    sm_theta[localSplineParamsIndex]  = a_theta[globalSplineParamsIndex];
+                    sm_dtheta[localSplineParamsIndex] = a_dtheta[globalSplineParamsIndex];
+                    assertIsFinite(sm_theta[localSplineParamsIndex]);
+                    assertIsFinite(sm_dtheta[localSplineParamsIndex]);
+                }
+            }
+
+            itemIdx.barrier(fence_space::local_space);
+        }
+        else
+        {
+            /* Recalculate Splines */
+            /* Staging coefficients/charges */
+            pmeGpuStageAtomData<float, atomsPerBlock, 1>(
+                    sm_coefficients.get_pointer(), a_coefficientsA.get_pointer(), itemIdx);
+            /* Staging coordinates */
+            pmeGpuStageAtomData<Float3, atomsPerBlock, 1>(
+                    sm_coordinates.get_pointer(), a_coordinates.get_pointer(), itemIdx);
+            itemIdx.barrier(fence_space::local_space);
+            const Float3 atomX      = sm_coordinates[atomIndexLocal];
+            const float  atomCharge = sm_coefficients[atomIndexLocal];
+
+            calculateSplines<order, atomsPerBlock, atomsPerWarp, true, false, numGrids, subGroupSize>(
+                    atomIndexOffset,
+                    atomX,
+                    atomCharge,
+                    tablesOffsets,
+                    realGridSizeFP,
+                    currentRecipBox0,
+                    currentRecipBox1,
+                    currentRecipBox2,
+                    nullptr,
+                    nullptr,
+                    nullptr,
+                    a_fractShiftsTable.get_pointer(),
+                    a_gridlineIndicesTable.get_pointer(),
+                    sm_theta.get_pointer(),
+                    sm_dtheta.get_pointer(),
+                    sm_gridlineIndices.get_pointer(),
+                    sm_fractCoords.get_pointer(),
+                    itemIdx);
+            subGroupBarrier(itemIdx);
+        }
+        float fx = 0.0F;
+        float fy = 0.0F;
+        float fz = 0.0F;
+
+        const int chargeCheck = pmeGpuCheckAtomCharge(a_coefficientsA[atomIndexGlobal]);
+
+        const int nx  = realGridSize[XX];
+        const int ny  = realGridSize[YY];
+        const int nz  = realGridSize[ZZ];
+        const int pny = realGridSizePadded[YY];
+        const int pnz = realGridSizePadded[ZZ];
+
+        const int atomWarpIndex = atomIndexLocal % atomsPerWarp;
+        const int warpIndex     = atomIndexLocal / atomsPerWarp;
+
+        const int splineIndexBase = getSplineParamIndexBase<order, atomsPerWarp>(warpIndex, atomWarpIndex);
+        const int splineIndexZ = getSplineParamIndex<order, atomsPerWarp>(splineIndexBase, ZZ, ithz);
+        const cl::sycl::float2 tdz{ sm_theta[splineIndexZ], sm_dtheta[splineIndexZ] };
+
+        int       iz     = sm_gridlineIndices[atomIndexLocal * DIM + ZZ] + ithz;
+        const int ixBase = sm_gridlineIndices[atomIndexLocal * DIM + XX];
+
+        if (iz >= nz)
+        {
+            iz -= nz;
+        }
+
+        const int ithyMin = (threadsPerAtom == ThreadsPerAtom::Order) ? 0 : itemIdx.get_local_id(YY);
+        const int ithyMax =
+                (threadsPerAtom == ThreadsPerAtom::Order) ? order : itemIdx.get_local_id(YY) + 1;
+        if (chargeCheck)
+        {
+            sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
+                                                                  &fy,
+                                                                  &fz,
+                                                                  ithyMin,
+                                                                  ithyMax,
+                                                                  ixBase,
+                                                                  iz,
+                                                                  nx,
+                                                                  ny,
+                                                                  pny,
+                                                                  pnz,
+                                                                  atomIndexLocal,
+                                                                  splineIndexBase,
+                                                                  tdz,
+                                                                  sm_gridlineIndices.get_pointer(),
+                                                                  sm_theta.get_pointer(),
+                                                                  sm_dtheta.get_pointer(),
+                                                                  a_gridA.get_pointer());
+        }
+        reduceAtomForces<order, atomDataSize, blockSize, subGroupSize>(
+                itemIdx, sm_forces.get_pointer(), atomIndexLocal, splineIndex, lineIndex, realGridSizeFP, fx, fy, fz);
+        itemIdx.barrier(fence_space::local_space);
+
+        /* Calculating the final forces with no component branching, atomsPerBlock threads */
+        const int forceIndexLocal  = threadLocalId;
+        const int forceIndexGlobal = atomIndexOffset + forceIndexLocal;
+        if (forceIndexLocal < atomsPerBlock)
+        {
+            calculateAndStoreGridForces(sm_forces.get_pointer(),
+                                        forceIndexLocal,
+                                        forceIndexGlobal,
+                                        currentRecipBox0,
+                                        currentRecipBox1,
+                                        currentRecipBox2,
+                                        scale,
+                                        a_coefficientsA.get_pointer());
+        }
+        itemIdx.barrier(fence_space::local_space);
+
+        static_assert(atomsPerBlock <= subGroupSize);
+
+        /* Writing or adding the final forces component-wise, single warp */
+        constexpr int blockForcesSize = atomsPerBlock * DIM;
+        constexpr int numIter         = (blockForcesSize + subGroupSize - 1) / subGroupSize;
+        constexpr int iterThreads     = blockForcesSize / numIter;
+        if (threadLocalId < iterThreads)
+        {
+#pragma unroll
+            for (int i = 0; i < numIter; i++)
+            {
+                const int floatIndexLocal  = i * iterThreads + threadLocalId;
+                const int float3IndexLocal = floatIndexLocal / 3;
+                const int dimLocal         = floatIndexLocal % 3;
+                static_assert(blockForcesSize % DIM == 0); // Ensures that dimGlobal == dimLocal
+                const int float3IndexGlobal = blockIndex * atomsPerBlock + float3IndexLocal;
+                // const int float3IndexGlobal = itemIdx.get_group(ZZ) * atomsPerBlock + float3IndexLocal;
+                a_forces[float3IndexGlobal][dimLocal] = sm_forces[float3IndexLocal][dimLocal];
+            }
+        }
+
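+        // Second-grid pass: with numGrids == 2 (FEP with two separate grids), the same
+        // per-atom partial forces are accumulated again below, this time from a_gridB with
+        // the complementary coefficient scale (1.0F - scale), and added to the forces
+        // already written above.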
+        if constexpr (numGrids == 2)
+        {
+            /* We must sync here since the same shared memory is used as above. */
+            itemIdx.barrier(fence_space::local_space);
+            fx = 0.0F;
+            fy = 0.0F;
+            fz = 0.0F;
+            const bool chargeCheck = pmeGpuCheckAtomCharge(a_coefficientsB[atomIndexGlobal]);
+            if (chargeCheck)
+            {
+                sumForceComponents<order, atomsPerWarp, wrapX, wrapY>(&fx,
+                                                                      &fy,
+                                                                      &fz,
+                                                                      ithyMin,
+                                                                      ithyMax,
+                                                                      ixBase,
+                                                                      iz,
+                                                                      nx,
+                                                                      ny,
+                                                                      pny,
+                                                                      pnz,
+                                                                      atomIndexLocal,
+                                                                      splineIndexBase,
+                                                                      tdz,
+                                                                      sm_gridlineIndices.get_pointer(),
+                                                                      sm_theta.get_pointer(),
+                                                                      sm_dtheta.get_pointer(),
+                                                                      a_gridB.get_pointer());
+            }
+            // Reduction of partial force contributions
+            reduceAtomForces<order, atomDataSize, blockSize, subGroupSize>(
+                    itemIdx, sm_forces.get_pointer(), atomIndexLocal, splineIndex, lineIndex, realGridSizeFP, fx, fy, fz);
+            itemIdx.barrier(fence_space::local_space);
+
+            /* Calculating the final forces with no component branching, atomsPerBlock threads */
+            if (forceIndexLocal < atomsPerBlock)
+            {
+                calculateAndStoreGridForces(sm_forces.get_pointer(),
+                                            forceIndexLocal,
+                                            forceIndexGlobal,
+                                            currentRecipBox0,
+                                            currentRecipBox1,
+                                            currentRecipBox2,
+                                            1.0F - scale,
+                                            a_coefficientsB.get_pointer());
+            }
+
+            itemIdx.barrier(fence_space::local_space);
+
+            /* Writing or adding the final forces component-wise, single warp */
+            if (threadLocalId < iterThreads)
+            {
+#pragma unroll
+                for (int i = 0; i < numIter; i++)
+                {
+                    const int floatIndexLocal  = i * iterThreads + threadLocalId;
+                    const int float3IndexLocal = floatIndexLocal / 3;
+                    const int dimLocal         = floatIndexLocal % 3;
+                    static_assert(blockForcesSize % DIM == 0); // Ensures that dimGlobal == dimLocal
+                    const int float3IndexGlobal = blockIndex * atomsPerBlock + float3IndexLocal;
+                    a_forces[float3IndexGlobal][dimLocal] += sm_forces[float3IndexLocal][dimLocal];
+                }
+            }
+        }
+    };
+}
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>::PmeGatherKernel()
+{
+    reset();
+}
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+void PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>::setArg(
+        size_t argIndex,
+        void*  arg)
+{
+    if (argIndex == 0)
+    {
+        auto* params   = reinterpret_cast<PmeGpuKernelParams*>(arg);
+        gridParams_    = &params->grid;
+        atomParams_    = &params->atoms;
+        dynamicParams_ = &params->current;
+    }
+    else
+    {
+        GMX_RELEASE_ASSERT(argIndex == 0, "Trying to pass too many args to the kernel");
+    }
+}
+
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+cl::sycl::event PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>::launch(
+        const KernelLaunchConfig& config,
+        const DeviceStream&       deviceStream)
+{
+    GMX_RELEASE_ASSERT(gridParams_, "Can not launch the kernel before setting its args");
+    GMX_RELEASE_ASSERT(atomParams_, "Can not launch the kernel before setting its args");
+    GMX_RELEASE_ASSERT(dynamicParams_, "Can not launch the kernel before setting its args");
+
+    using kernelNameType =
+            PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>;
+
+    // SYCL has a different multidimensional layout than OpenCL/CUDA.
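+    // In SYCL's nd_range<3>, dimension 2 is the fastest-varying one (it plays the role of
+    // CUDA's x), so the CUDA-style {x, y, z} sizes from KernelLaunchConfig are reversed
+    // below. E.g., a hypothetical config.blockSize of {8, 4, 4} becomes a local range of
+    // {4, 4, 8}.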
+    const cl::sycl::range<3> localSize{ config.blockSize[2], config.blockSize[1], config.blockSize[0] };
+    const cl::sycl::range<3> groupRange{ config.gridSize[2], config.gridSize[1], config.gridSize[0] };
+    const cl::sycl::nd_range<3> range{ groupRange * localSize, localSize };
+
+    cl::sycl::queue q = deviceStream.stream();
+
+    cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
+        auto kernel = pmeGatherKernel<order, true, true, numGrids, readGlobal, threadsPerAtom, subGroupSize>(
+                cgh,
+                atomParams_->nAtoms,
+                gridParams_->d_realGrid[0],
+                gridParams_->d_realGrid[1],
+                atomParams_->d_coefficients[0],
+                atomParams_->d_coefficients[1],
+                atomParams_->d_coordinates,
+                atomParams_->d_forces,
+                atomParams_->d_theta,
+                atomParams_->d_dtheta,
+                atomParams_->d_gridlineIndices,
+                gridParams_->d_fractShiftsTable,
+                gridParams_->d_gridlineIndicesTable,
+                gridParams_->tablesOffsets,
+                gridParams_->realGridSize,
+                gridParams_->realGridSizeFP,
+                gridParams_->realGridSizePadded,
+                dynamicParams_->recipBox[0],
+                dynamicParams_->recipBox[1],
+                dynamicParams_->recipBox[2],
+                dynamicParams_->scale);
+        cgh.parallel_for<kernelNameType>(range, kernel);
+    });
+
+    // Delete set args, so we don't forget to set them before the next launch.
+    reset();
+
+    return e;
+}
+
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+void PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>::reset()
+{
+    gridParams_    = nullptr;
+    atomParams_    = nullptr;
+    dynamicParams_ = nullptr;
+}
+
+
+//! Kernel instantiations
+/* Disable the "explicit template instantiation 'PmeGatherKernel<...>' will emit a vtable in every
+ * translation unit [-Wweak-template-vtables]" warning.
+ * It is only explicitly instantiated in this translation unit, so we should be safe.
+ */
+#ifdef __clang__
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wweak-template-vtables"
+#endif
+
+#define INSTANTIATE_3(order, numGrids, readGlobal, threadsPerAtom, subGroupSize) \
+    template class PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>;
+
+#define INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)     \
+    INSTANTIATE_3(order, numGrids, true, threadsPerAtom, subGroupSize);  \
+    INSTANTIATE_3(order, numGrids, false, threadsPerAtom, subGroupSize);
+
+#define INSTANTIATE(order, subGroupSize)                                 \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+
+#if GMX_SYCL_DPCPP
+INSTANTIATE(4, 16); // TODO: Choose best value, Issue #4153.
+#elif GMX_SYCL_HIPSYCL
+INSTANTIATE(4, 32);
+INSTANTIATE(4, 64);
+#endif
+
+#ifdef __clang__
+#    pragma clang diagnostic pop
+#endif
diff --git a/src/gromacs/ewald/pme_gather_sycl.h b/src/gromacs/ewald/pme_gather_sycl.h
new file mode 100644
index 0000000000..c6b07f34ea
--- /dev/null
+++ b/src/gromacs/ewald/pme_gather_sycl.h
@@ -0,0 +1,66 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Declares the PME GPU gather kernel for SYCL.
+ *
+ * \author Andrey Alekseenko
+ */
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+
+#include "gromacs/gpu_utils/syclutils.h"
+
+#include "pme_grid.h"
+#include "pme_gpu_types_host.h"
+
+struct PmeGpuGridParams;
+struct PmeGpuAtomParams;
+struct PmeGpuDynamicParams;
+
+template<int order, int numGrids, bool readGlobal, ThreadsPerAtom threadsPerAtom, int subGroupSize>
+class PmeGatherKernel : public ISyclKernelFunctor
+{
+public:
+    PmeGatherKernel();
+    void setArg(size_t argIndex, void* arg) override;
+    cl::sycl::event launch(const KernelLaunchConfig& config, const DeviceStream& deviceStream) override;
+
+private:
+    PmeGpuGridParams*    gridParams_;
+    PmeGpuAtomParams*    atomParams_;
+    PmeGpuDynamicParams* dynamicParams_;
+    void reset();
+};
diff --git a/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp b/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
index 07da0a848c..196cff26ae 100644
--- a/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
+++ b/src/gromacs/ewald/pme_gpu_program_impl_sycl.cpp
@@ -49,6 +49,7 @@
 #include "gromacs/gpu_utils/syclutils.h"
 
 #include "pme_gpu_program_impl.h"
+#include "pme_gather_sycl.h"
 #include "pme_spread_sycl.h"
 
 #include "pme_gpu_constants.h"
@@ -72,20 +73,32 @@ static int subGroupSizeFromVendor(const DeviceInformation& deviceInfo)
     }
 }
 
-#define INSTANTIATE_3(order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) \
+#define INSTANTIATE_SPREAD_2(                                                                      \
+        order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize) \
     extern template class PmeSplineAndSpreadKernel<order, computeSplines, spreadCharges, numGrids, writeGlobal, threadsPerAtom, subGroupSize>;
 
-#define INSTANTIATE_2(order, numGrids, threadsPerAtom, subGroupSize)                \
-    INSTANTIATE_3(order, true, true, numGrids, true, threadsPerAtom, subGroupSize); \
-    INSTANTIATE_3(order, true, false, numGrids, true, threadsPerAtom, subGroupSize); \
-    INSTANTIATE_3(order, false, true, numGrids, true, threadsPerAtom, subGroupSize); \
-    INSTANTIATE_3(order, true, true, numGrids, false, threadsPerAtom, subGroupSize);
+#define INSTANTIATE_SPREAD(order, numGrids, threadsPerAtom, subGroupSize)                   \
+    INSTANTIATE_SPREAD_2(order, true, true, numGrids, true, threadsPerAtom, subGroupSize);  \
+    INSTANTIATE_SPREAD_2(order, true, false, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_SPREAD_2(order, false, true, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_SPREAD_2(order, true, true, numGrids, false, threadsPerAtom, subGroupSize);
 
-#define INSTANTIATE(order, subGroupSize)                                 \
-    INSTANTIATE_2(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
-    INSTANTIATE_2(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
-    INSTANTIATE_2(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
-    INSTANTIATE_2(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+#define INSTANTIATE_GATHER_2(order, numGrids, readGlobal, threadsPerAtom, subGroupSize) \
+    extern template class PmeGatherKernel<order, numGrids, readGlobal, threadsPerAtom, subGroupSize>;
+
+#define INSTANTIATE_GATHER(order, numGrids, threadsPerAtom, subGroupSize)      \
+    INSTANTIATE_GATHER_2(order, numGrids, true, threadsPerAtom, subGroupSize); \
+    INSTANTIATE_GATHER_2(order, numGrids, false, threadsPerAtom, subGroupSize);
+
+#define INSTANTIATE_X(x, order, subGroupSize)                              \
+    INSTANTIATE_##x(order, 1, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_##x(order, 1, ThreadsPerAtom::OrderSquared, subGroupSize); \
+    INSTANTIATE_##x(order, 2, ThreadsPerAtom::Order, subGroupSize);        \
+    INSTANTIATE_##x(order, 2, ThreadsPerAtom::OrderSquared, subGroupSize);
+
+#define INSTANTIATE(order, subGroupSize)        \
+    INSTANTIATE_X(SPREAD, order, subGroupSize); \
+    INSTANTIATE_X(GATHER, order, subGroupSize);
 
 #if GMX_SYCL_DPCPP
 INSTANTIATE(4, 16);
@@ -135,6 +148,22 @@ static void setKernelPointers(struct PmeGpuProgramImpl* pmeGpuProgram)
             new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, 1, true, ThreadsPerAtom::Order, subGroupSize>();
     pmeGpuProgram->spreadKernelThPerAtom4Dual =
             new PmeSplineAndSpreadKernel<c_pmeOrder, true, true, 2, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->gatherKernelSingle =
+            new PmeGatherKernel<c_pmeOrder, 1, false, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->gatherKernelThPerAtom4Single =
+            new PmeGatherKernel<c_pmeOrder, 1, false, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->gatherKernelReadSplinesSingle =
+            new PmeGatherKernel<c_pmeOrder, 1, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->gatherKernelReadSplinesThPerAtom4Single =
+            new PmeGatherKernel<c_pmeOrder, 1, true, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->gatherKernelDual =
+            new PmeGatherKernel<c_pmeOrder, 2, false, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->gatherKernelThPerAtom4Dual =
+            new PmeGatherKernel<c_pmeOrder, 2, false, ThreadsPerAtom::Order, subGroupSize>();
+    pmeGpuProgram->gatherKernelReadSplinesDual =
+            new PmeGatherKernel<c_pmeOrder, 2, true, ThreadsPerAtom::OrderSquared, subGroupSize>();
+    pmeGpuProgram->gatherKernelReadSplinesThPerAtom4Dual =
+            new PmeGatherKernel<c_pmeOrder, 2, true, ThreadsPerAtom::Order, subGroupSize>();
 }
 
 PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
diff --git a/src/gromacs/ewald/tests/pmegathertest.cpp b/src/gromacs/ewald/tests/pmegathertest.cpp
index d1d3ffcc7f..444ff904cf 100644
--- a/src/gromacs/ewald/tests/pmegathertest.cpp
+++ b/src/gromacs/ewald/tests/pmegathertest.cpp
@@ -254,7 +254,14 @@ public:
             }
             s_inputAtomDataSets_[atomCount] = atomData;
         }
-        s_pmeTestHardwareContexts = createPmeTestHardwareContextList();
+        s_pmeTestHardwareContexts    = createPmeTestHardwareContextList();
+        g_allowPmeWithSyclForTesting = true; // We support PmeGather with SYCL
+    }
+
+    static void TearDownTestSuite()
+    {
+        // Revert the value back.
+        g_allowPmeWithSyclForTesting = false;
     }
 
     //! The test
-- 
2.22.0