From: Andrey Alekseenko <al42and@gmail.com>
Date: Thu, 18 Feb 2021 11:51:28 +0000 (+0000)
Subject: SYCL NBNXM offload support
X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=b8fb87a0c5d65014d97e618241a0075875a81199;p=alexxy%2Fgromacs.git

SYCL NBNXM offload support

Associated changes:

- Added function stubs to PME: necessary for compilation.
- Stricter SYCL hardware compatibility checks: limits on subgroup size
  and the availability of local memory.
- The kernel implementation and overall logic closely follow the OpenCL
  implementation. Divergences are documented locally.

Limitations:

- No fine-grained timings yet.
- Code-duplication with CUDA and OpenCL: see #2608.
- Minor differences in local/nonlocal synchronization: see #3895,
  related to #2608.
- Only the OpenCL backend was extensively tested. LevelZero works fine
  without MPI but stalls due to a known bug. The fix for DPCPP runtime
  is available, but not yet part of any OneAPI release:
  https://github.com/intel/llvm/pull/3045.
- The complex/position-restraints regression test fails: see #3846.
- No performance tuning: see #3847.

Performance on rnase-cubic system is similar to OpenCL implementation.
---

diff --git a/src/gromacs/ewald/CMakeLists.txt b/src/gromacs/ewald/CMakeLists.txt
index 5699d52651..ac50cc1a48 100644
--- a/src/gromacs/ewald/CMakeLists.txt
+++ b/src/gromacs/ewald/CMakeLists.txt
@@ -2,7 +2,7 @@
 # This file is part of the GROMACS molecular simulation package.
 #
 # Copyright (c) 2014,2015,2016,2017,2018 by the GROMACS development team.
-# Copyright (c) 2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -76,7 +76,7 @@ if (GMX_GPU_CUDA)
         # Must add these files so they can include device_information.h
         pme_gpu_internal.cpp
         pme_gpu_timings.cpp
-	)
+        )
 elseif (GMX_GPU_OPENCL)
     gmx_add_libgromacs_sources(
         # OpenCL-specific sources
@@ -90,11 +90,18 @@ elseif (GMX_GPU_OPENCL)
 elseif (GMX_GPU_SYCL)
     # SYCL-TODO: proper implementation
     gmx_add_libgromacs_sources(
-        pme_gpu_program_impl.cpp
+        # Files that implement stubs
+        pme_gpu_sycl_stubs.cpp
+        # GPU-specific sources
+        pme_gpu.cpp
+        pme_gpu_internal.cpp
+        pme_gpu_timings.cpp
         )
     _gmx_add_files_to_property(SYCL_SOURCES
-        pme_gpu_program_impl.cpp
+        pme_gpu_internal.cpp
         pme_gpu_program.cpp
+        pme_gpu_sycl_stubs.cpp
+        pme_gpu_timings.cpp
         )
 else()
     gmx_add_libgromacs_sources(
diff --git a/src/gromacs/ewald/pme_gpu_3dfft.h b/src/gromacs/ewald/pme_gpu_3dfft.h
index a39f751bab..d71e43522c 100644
--- a/src/gromacs/ewald/pme_gpu_3dfft.h
+++ b/src/gromacs/ewald/pme_gpu_3dfft.h
@@ -1,7 +1,8 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team.
+ * Copyright (c) 2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -56,6 +57,8 @@
 
 #    include "gromacs/gpu_utils/gmxopencl.h"
 #    include "gromacs/gpu_utils/gputraits_ocl.h"
+#elif GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/gputraits_sycl.h"
 #endif
 
 #include "gromacs/fft/fft.h" // for the enum gmx_fft_direction
diff --git a/src/gromacs/ewald/pme_gpu_internal.cpp b/src/gromacs/ewald/pme_gpu_internal.cpp
index e78373ec8b..0d7ee39e23 100644
--- a/src/gromacs/ewald/pme_gpu_internal.cpp
+++ b/src/gromacs/ewald/pme_gpu_internal.cpp
@@ -1,7 +1,8 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2018,2019,2020, by the GROMACS development team.
+ * Copyright (c) 2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -75,6 +76,8 @@
 #    include "pme.cuh"
 #elif GMX_GPU_OPENCL
 #    include "gromacs/gpu_utils/gmxopencl.h"
+#elif GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/syclutils.h"
 #endif
 
 #include "gromacs/ewald/pme.h"
@@ -468,7 +471,7 @@ void pme_gpu_free_fract_shifts(const PmeGpu* pmeGpu)
                             kernelParamsPtr->fractShiftsTableTexture);
     destroyParamLookupTable(&kernelParamsPtr->grid.d_gridlineIndicesTable,
                             kernelParamsPtr->gridlineIndicesTableTexture);
-#elif GMX_GPU_OPENCL
+#elif GMX_GPU_OPENCL || GMX_GPU_SYCL
     freeDeviceBuffer(&kernelParamsPtr->grid.d_fractShiftsTable);
     freeDeviceBuffer(&kernelParamsPtr->grid.d_gridlineIndicesTable);
 #endif
diff --git a/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp b/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp
new file mode 100644
index 0000000000..c6f6c1c303
--- /dev/null
+++ b/src/gromacs/ewald/pme_gpu_sycl_stubs.cpp
@@ -0,0 +1,79 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ * \brief Implements stubs of high-level PME GPU functions for SYCL.
+ *
+ * \author Andrey Alekseenko <al42and@gmail.com>
+ *
+ * \ingroup module_ewald
+ */
+#include "gmxpre.h"
+
+#include "gromacs/ewald/ewald_utils.h"
+
+#include "pme_gpu_3dfft.h"
+#include "pme_gpu_internal.h"
+#include "pme_gpu_program_impl.h"
+
+PmeGpuProgramImpl::PmeGpuProgramImpl(const DeviceContext& deviceContext) :
+    deviceContext_(deviceContext),
+    warpSize_(0),
+    spreadWorkGroupSize(0),
+    gatherWorkGroupSize(0),
+    solveMaxWorkGroupSize(0)
+{
+    // SYCL-TODO
+}
+
+PmeGpuProgramImpl::~PmeGpuProgramImpl() = default;
+
+// [[noreturn]] attributes must be added in the common headers, so it's easier to silence the warning here
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmissing-noreturn"
+
+GpuParallel3dFft::GpuParallel3dFft(PmeGpu const* /*pmeGpu*/, int /*gridIndex*/)
+{
+    GMX_THROW(gmx::NotImplementedError("PME is not implemented in SYCL"));
+}
+
+GpuParallel3dFft::~GpuParallel3dFft() = default;
+
+void GpuParallel3dFft::perform3dFft(gmx_fft_direction /*dir*/, CommandEvent* /*timingEvent*/)
+{
+    GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet"));
+}
+
+#pragma clang diagnostic pop
diff --git a/src/gromacs/ewald/pme_gpu_types_host.h b/src/gromacs/ewald/pme_gpu_types_host.h
index 26405433f5..055e61e5d8 100644
--- a/src/gromacs/ewald/pme_gpu_types_host.h
+++ b/src/gromacs/ewald/pme_gpu_types_host.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -78,7 +78,7 @@ typedef int PmeGpuSpecific;
 struct PmeGpuCudaKernelParams;
 /*! \brief A typedef for including the GPU kernel arguments data by pointer */
 typedef PmeGpuCudaKernelParams PmeGpuKernelParams;
-#elif GMX_GPU_OPENCL
+#elif GMX_GPU_OPENCL || GMX_GPU_SYCL
 struct PmeGpuKernelParamsBase;
 /*! \brief A typedef for including the GPU kernel arguments data by pointer */
 typedef PmeGpuKernelParamsBase PmeGpuKernelParams;
diff --git a/src/gromacs/ewald/pme_gpu_types_host_impl.h b/src/gromacs/ewald/pme_gpu_types_host_impl.h
index cf98d701c5..976e064cb2 100644
--- a/src/gromacs/ewald/pme_gpu_types_host_impl.h
+++ b/src/gromacs/ewald/pme_gpu_types_host_impl.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -56,6 +56,9 @@
 #elif GMX_GPU_OPENCL
 #    include "gromacs/gpu_utils/gpueventsynchronizer_ocl.h"
 #    include "gromacs/gpu_utils/gpuregiontimer_ocl.h"
+#elif GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
+#    include "gromacs/gpu_utils/gpuregiontimer_sycl.h"
 #endif
 
 #include "gromacs/timing/gpu_timing.h" // for gtPME_EVENT_COUNT
diff --git a/src/gromacs/gpu_utils/CMakeLists.txt b/src/gromacs/gpu_utils/CMakeLists.txt
index 7958f18c13..3b59b9279e 100644
--- a/src/gromacs/gpu_utils/CMakeLists.txt
+++ b/src/gromacs/gpu_utils/CMakeLists.txt
@@ -1,7 +1,8 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2015,2016,2017,2018,2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2015,2016,2017,2018,2019, by the GROMACS development team.
+# Copyright (c) 2020,2021, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -62,12 +63,13 @@ elseif(GMX_GPU_CUDA)
         )
     _gmx_add_files_to_property(CUDA_SOURCES
         device_stream_manager.cpp
-	)
+        )
 elseif(GMX_GPU_SYCL)
     gmx_add_libgromacs_sources(
         devicebuffer_sycl.cpp
         device_context_sycl.cpp
         device_stream_sycl.cpp
+        syclutils.cpp
         )
     _gmx_add_files_to_property(SYCL_SOURCES
         devicebuffer_sycl.cpp
@@ -75,6 +77,7 @@ elseif(GMX_GPU_SYCL)
         device_context_sycl.cpp
         device_stream_manager.cpp
         device_stream_sycl.cpp
+        syclutils.cpp
         )
 else()
     gmx_add_libgromacs_sources(
diff --git a/src/gromacs/gpu_utils/gpu_macros.h b/src/gromacs/gpu_utils/gpu_macros.h
index 1bfbefd0a7..bf57d57d3b 100644
--- a/src/gromacs/gpu_utils/gpu_macros.h
+++ b/src/gromacs/gpu_utils/gpu_macros.h
@@ -1,7 +1,8 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team.
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -73,11 +74,15 @@
 #    define OPENCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
 #    define OPENCL_FUNC_TERM REAL_FUNC_TERM
 #    define OPENCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#    define SYCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#    define SYCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#    define SYCL_FUNC_TERM REAL_FUNC_TERM
+#    define SYCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
 
 #else // Not DOXYGEN
 
 /* GPU support is enabled, so these functions will have real code defined somewhere */
-#    if GMX_GPU && !GMX_GPU_SYCL
+#    if GMX_GPU
 #        define GPU_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
 #        define GPU_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
 #        define GPU_FUNC_TERM REAL_FUNC_TERM
@@ -87,7 +92,6 @@
 #        define GPU_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
 #        define GPU_FUNC_TERM NULL_FUNC_TERM
 #        define GPU_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
-#
 #    endif
 
 /* Enable and disable platform-specific function implementations */
@@ -115,6 +119,18 @@
 #        define CUDA_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
 #    endif
 
+#    if GMX_GPU_SYCL
+#        define SYCL_FUNC_QUALIFIER REAL_FUNC_QUALIFIER
+#        define SYCL_FUNC_ARGUMENT REAL_FUNC_ARGUMENT
+#        define SYCL_FUNC_TERM REAL_FUNC_TERM
+#        define SYCL_FUNC_TERM_WITH_RETURN(arg) REAL_FUNC_TERM_WITH_RETURN(arg)
+#    else
+#        define SYCL_FUNC_QUALIFIER NULL_FUNC_QUALIFIER
+#        define SYCL_FUNC_ARGUMENT NULL_FUNC_ARGUMENT
+#        define SYCL_FUNC_TERM NULL_FUNC_TERM
+#        define SYCL_FUNC_TERM_WITH_RETURN(arg) NULL_FUNC_TERM_WITH_RETURN(arg)
+#    endif
+
 #endif // ifdef DOXYGEN
 
 #endif // GMX_GPU_UTILS_MACROS_H
diff --git a/src/gromacs/gpu_utils/gpu_utils.cpp b/src/gromacs/gpu_utils/gpu_utils.cpp
index eb26ce0a8e..663137084f 100644
--- a/src/gromacs/gpu_utils/gpu_utils.cpp
+++ b/src/gromacs/gpu_utils/gpu_utils.cpp
@@ -1,7 +1,8 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2014,2015,2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2014,2015,2017,2018,2019, by the GROMACS development team.
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -43,10 +44,7 @@
 
 #include "config.h"
 
-#include <cassert>
-
 #include "gromacs/utility/arrayref.h"
-#include "gromacs/utility/smalloc.h"
 #include "gromacs/utility/stringutil.h"
 
 #ifdef _MSC_VER
@@ -79,9 +77,5 @@ bool buildSupportsNonbondedOnGpu(std::string* error)
     {
         errorReasons.emplace_back("non-GPU build of GROMACS");
     }
-    if (GMX_GPU_SYCL)
-    {
-        errorReasons.emplace_back("SYCL build of GROMACS");
-    }
     return addMessageIfNotSupported(errorReasons, error);
 }
diff --git a/src/gromacs/gpu_utils/gputraits_sycl.h b/src/gromacs/gpu_utils/gputraits_sycl.h
index 52e40bb836..d53552e3c2 100644
--- a/src/gromacs/gpu_utils/gputraits_sycl.h
+++ b/src/gromacs/gpu_utils/gputraits_sycl.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -47,6 +47,7 @@
 #include <cstddef>
 
 #include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/math/vectypes.h"
 
 using DeviceTexture = void*;
 
diff --git a/src/gromacs/gpu_utils/syclutils.cpp b/src/gromacs/gpu_utils/syclutils.cpp
new file mode 100644
index 0000000000..0f660c04f5
--- /dev/null
+++ b/src/gromacs/gpu_utils/syclutils.cpp
@@ -0,0 +1,78 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ *  \brief Define utility routines for SYCL
+ *
+ *  \author Andrey Alekseenko <al42and@gmail.com>
+ */
+#include "gmxpre.h"
+
+#include "syclutils.h"
+
+#include "gromacs/utility/smalloc.h"
+
+/*! \brief Allocates \p nbytes of host memory. Use \c pfree to free memory allocated with this function.
+ *
+ *  \todo
+ *  This function was copied from OpenCL implementation, not tuned for SYCL at all.
+ *  Once SYCL2020 is out, might be worthwhile to look into USM and sycl::malloc_host / sycl::aligned_alloc_host.
+ *  Overall, it is better to directly use sycl::buffer instead of pinned arrays. But this function
+ *  is needed to compile some PME code with SYCL enabled, even if it is never used.
+ *
+ * \param[in,out]    h_ptr   Pointer where to store the address of the newly allocated buffer.
+ * \param[in]        nbytes  Size in bytes of the buffer to be allocated.
+ */
+void pmalloc(void** h_ptr, size_t nbytes)
+{
+    /* Need a temporary type whose size is 1 byte, so that the
+     * implementation of snew_aligned can cope without issuing
+     * warnings. */
+    auto** temporary = reinterpret_cast<std::byte**>(h_ptr);
+
+    /* 16-byte alignment inherited from OpenCL and does not sound unreasonable */
+    snew_aligned(*temporary, nbytes, 16);
+}
+
+/*! \brief Frees memory allocated with pmalloc.
+ *
+ * \param[in]    h_ptr   Buffer allocated with pmalloc that needs to be freed.
+ */
+void pfree(void* h_ptr)
+{
+    if (h_ptr)
+    {
+        sfree_aligned(h_ptr);
+    }
+}
diff --git a/src/gromacs/gpu_utils/syclutils.h b/src/gromacs/gpu_utils/syclutils.h
new file mode 100644
index 0000000000..1318a2a9b4
--- /dev/null
+++ b/src/gromacs/gpu_utils/syclutils.h
@@ -0,0 +1,129 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \libinternal \file
+ *  \brief Declare utility routines for SYCL
+ *
+ *  \author Andrey Alekseenko <al42and@gmail.com>
+ *  \inlibraryapi
+ */
+#ifndef GMX_GPU_UTILS_SYCLUTILS_H
+#define GMX_GPU_UTILS_SYCLUTILS_H
+
+#include <string>
+
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/gmxassert.h"
+#include "gromacs/utility/stringutil.h"
+
+class DeviceStream;
+enum class GpuApiCallBehavior;
+
+/*! \internal
+ * \brief SYCL GPU runtime data
+ *
+ * The device runtime data is meant to hold objects associated with a GROMACS rank's
+ * (thread or process) use of a single device (multiple devices per rank is not
+ * implemented). These objects should be constructed at the point where a device
+ * gets assigned to a rank and released at when this assignment is no longer valid
+ * (i.e. at cleanup in the current implementation).
+ */
+struct gmx_device_runtime_data_t
+{
+};
+
+#ifndef DOXYGEN
+
+/*! \brief Allocate host memory in malloc style */
+void pmalloc(void** h_ptr, size_t nbytes);
+
+/*! \brief Free host memory in malloc style */
+void pfree(void* h_ptr);
+
+/* To properly mark function as [[noreturn]], we must do it everywhere it is declared, which
+ * will pollute common headers.*/
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wmissing-noreturn"
+
+/*! \brief
+ * A wrapper function for setting up all the SYCL kernel arguments.
+ * Calls the recursive functions above.
+ *
+ * \tparam    Args            Types of all the kernel arguments
+ * \param[in] kernel          Kernel function handle
+ * \param[in] config          Kernel configuration for launching
+ * \param[in] argsPtrs        Pointers to all the kernel arguments
+ * \returns A handle for the prepared parameter pack to be used with launchGpuKernel() as the last argument.
+ */
+template<typename... Args>
+void* prepareGpuKernelArguments(void* /*kernel*/, const KernelLaunchConfig& /*config*/, const Args*... /*argsPtrs*/)
+{
+    GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet"));
+}
+
+/*! \brief Launches the SYCL kernel and handles the errors.
+ *
+ * \param[in] kernel          Kernel function handle
+ * \param[in] config          Kernel configuration for launching
+ * \param[in] deviceStream    GPU stream to launch kernel in
+ * \param[in] timingEvent     Timing event, fetched from GpuRegionTimer
+ * \param[in] kernelName      Human readable kernel description, for error handling only
+ * \throws gmx::InternalError on kernel launch failure
+ */
+inline void launchGpuKernel(void* /*kernel*/,
+                            const KernelLaunchConfig& /*config*/,
+                            const DeviceStream& /*deviceStream*/,
+                            CommandEvent* /*timingEvent*/,
+                            const char* /*kernelName*/,
+                            const void* /*kernelArgs*/)
+{
+    GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet"));
+}
+
+/*! \brief Pretend to check a SYCL stream for unfinished work (dummy implementation).
+ *
+ *  \returns  Not implemented in SYCL.
+ */
+static inline bool haveStreamTasksCompleted(const DeviceStream& /* deviceStream */)
+{
+    GMX_THROW(gmx::NotImplementedError("Not implemented on SYCL yet"));
+}
+
+#    pragma clang diagnostic pop
+
+#endif // !DOXYGEN
+
+#endif
diff --git a/src/gromacs/hardware/device_management_sycl.cpp b/src/gromacs/hardware/device_management_sycl.cpp
index 93021e418f..4fd401e04b 100644
--- a/src/gromacs/hardware/device_management_sycl.cpp
+++ b/src/gromacs/hardware/device_management_sycl.cpp
@@ -92,14 +92,43 @@ static DeviceStatus isDeviceCompatible(const cl::sycl::device& syclDevice)
         return DeviceStatus::Compatible;
     }
 
-    if (syclDevice.is_accelerator()) // FPGAs and FPGA emulators
+    if (syclDevice.get_info<cl::sycl::info::device::local_mem_type>() == cl::sycl::info::local_mem_type::none)
     {
+        // While some kernels (leapfrog) can run without shared/local memory, this is a bad sign
         return DeviceStatus::Incompatible;
     }
-    else
+
+    if (syclDevice.is_host())
+    {
+        // Host device does not support subgroups or even querying for sub_group_sizes
+        return DeviceStatus::Incompatible;
+    }
+    const std::vector<size_t> supportedSubGroupSizes =
+            syclDevice.get_info<cl::sycl::info::device::sub_group_sizes>();
+    const size_t requiredSubGroupSizeForNBNXM = 8;
+    if (std::find(supportedSubGroupSizes.begin(), supportedSubGroupSizes.end(), requiredSubGroupSizeForNBNXM)
+        == supportedSubGroupSizes.end())
+    {
+        return DeviceStatus::IncompatibleClusterSize;
+    }
+
+    /* Host device can not be used, because NBNXM requires sub-groups, which are not supported.
+     * Accelerators (FPGAs and their emulators) are not supported.
+     * So, the only viable options are CPUs and GPUs. */
+    const bool forceCpu = (getenv("GMX_SYCL_FORCE_CPU") != nullptr);
+
+    if (forceCpu && syclDevice.is_cpu())
+    {
+        return DeviceStatus::Compatible;
+    }
+    else if (!forceCpu && syclDevice.is_gpu())
     {
         return DeviceStatus::Compatible;
     }
+    else
+    {
+        return DeviceStatus::Incompatible;
+    }
 }
 
 
diff --git a/src/gromacs/listed_forces/gpubonded_impl.cpp b/src/gromacs/listed_forces/gpubonded_impl.cpp
index ff62325723..ebe0f95085 100644
--- a/src/gromacs/listed_forces/gpubonded_impl.cpp
+++ b/src/gromacs/listed_forces/gpubonded_impl.cpp
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -124,6 +124,10 @@ bool buildSupportsGpuBondeds(std::string* error)
     {
         errorReasons.emplace_back("not supported with OpenCL build of GROMACS");
     }
+    if (GMX_GPU_SYCL)
+    {
+        errorReasons.emplace_back("not supported with SYCL build of GROMACS");
+    }
     else if (!GMX_GPU)
     {
         errorReasons.emplace_back("not supported with CPU-only build of GROMACS");
diff --git a/src/gromacs/nbnxm/CMakeLists.txt b/src/gromacs/nbnxm/CMakeLists.txt
index bca3f88df9..91738e6235 100644
--- a/src/gromacs/nbnxm/CMakeLists.txt
+++ b/src/gromacs/nbnxm/CMakeLists.txt
@@ -1,7 +1,7 @@
 #
 # This file is part of the GROMACS molecular simulation package.
 #
-# Copyright (c) 2019,2020, by the GROMACS development team, led by
+# Copyright (c) 2019,2020,2021, by the GROMACS development team, led by
 # Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 # and including many others, as listed in the AUTHORS file in the
 # top-level source directory and at http://www.gromacs.org.
@@ -69,7 +69,7 @@ if(GMX_GPU_CUDA)
     gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp)
     _gmx_add_files_to_property(CUDA_SOURCES
         nbnxm_gpu_data_mgmt.cpp
-	)
+        )
 endif()
 
 if(GMX_GPU_OPENCL)
@@ -78,6 +78,12 @@ if(GMX_GPU_OPENCL)
     gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp)
 endif()
 
+if(GMX_GPU_SYCL)
+    add_subdirectory(sycl)
+    gmx_add_libgromacs_sources(nbnxm_gpu_data_mgmt.cpp)
+    _gmx_add_files_to_property(SYCL_SOURCES nbnxm_gpu_data_mgmt.cpp nbnxm.cpp)
+endif()
+
 set(LIBGROMACS_SOURCES ${LIBGROMACS_SOURCES} ${NBNXM_SOURCES} PARENT_SCOPE)
 
 
@@ -103,4 +109,4 @@ target_link_libraries(nbnxm INTERFACE
 #target_link_libraries(nbnxm PUBLIC
 target_link_libraries(nbnxm INTERFACE
         utility
-        )
\ No newline at end of file
+        )
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index 9b383d2ec6..bb1eb874ae 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -55,6 +55,10 @@
 #    include "opencl/nbnxm_ocl_types.h"
 #endif
 
+#if GMX_GPU_SYCL
+#    include "sycl/nbnxm_sycl_types.h"
+#endif
+
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/listed_forces/gpubonded.h"
 #include "gromacs/math/vec.h"
@@ -246,8 +250,7 @@ static void countPruneKernelTime(GpuTimers*                 timers,
  * \param[out] e_el           Variable to accumulate electrostatic energy into
  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
  */
-template<typename StagingData>
-static inline void gpu_reduce_staged_outputs(const StagingData&        nbst,
+static inline void gpu_reduce_staged_outputs(const nb_staging_t&       nbst,
                                              const InteractionLocality iLocality,
                                              const bool                reduceEnergies,
                                              const bool                reduceFshift,
@@ -435,9 +438,12 @@ bool gpu_try_finish_task(NbnxmGpu*                nb,
         }
     }
 
-    /* Always reset both pruning flags (doesn't hurt doing it even when timing is off). */
-    nb->timers->interaction[iLocality].didPrune =
-            nb->timers->interaction[iLocality].didRollingPrune = false;
+    /* Reset both pruning flags. */
+    if (nb->bDoTime)
+    {
+        nb->timers->interaction[iLocality].didPrune =
+                nb->timers->interaction[iLocality].didRollingPrune = false;
+    }
 
     /* Turn off initial list pruning (doesn't hurt if this is not pair-search step). */
     nb->plist[iLocality]->haveFreshList = false;
diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h
index 2d2e90791e..26cd3c165f 100644
--- a/src/gromacs/nbnxm/gpu_types_common.h
+++ b/src/gromacs/nbnxm/gpu_types_common.h
@@ -44,6 +44,7 @@
 
 #include "config.h"
 
+#include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/mdtypes/locality.h"
 #include "gromacs/utility/enumerationhelpers.h"
 
@@ -58,6 +59,10 @@
 #    include "gromacs/gpu_utils/gpuregiontimer.cuh"
 #endif
 
+#if GMX_GPU_SYCL
+#    include "gromacs/gpu_utils/gpuregiontimer_sycl.h"
+#endif
+
 /** \internal
  * \brief Parameters required for the GPU nonbonded calculations.
  */
@@ -77,7 +82,7 @@ struct NBParamGpu
     float two_k_rf;
     //! Ewald/PME parameter
     float ewald_beta;
-    //! Ewald/PME correction term substracted from the direct-space potential
+    //! Ewald/PME correction term subtracted from the direct-space potential
     float sh_ewald;
     //! LJ-Ewald/PME correction term added to the correction potential
     float sh_lj_ewald;
diff --git a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
index 9fe755c586..f811eee3e5 100644
--- a/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
+++ b/src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
@@ -56,6 +56,10 @@
 #    include "opencl/nbnxm_ocl_types.h"
 #endif
 
+#if GMX_GPU_SYCL
+#    include "sycl/nbnxm_sycl_types.h"
+#endif
+
 #include "nbnxm_gpu_data_mgmt.h"
 
 #include "gromacs/hardware/device_information.h"
@@ -63,6 +67,7 @@
 #include "gromacs/nbnxm/gpu_data_mgmt.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/fatalerror.h"
 
 #include "nbnxm_gpu.h"
 #include "pairlistsets.h"
diff --git a/src/gromacs/nbnxm/pairlistparams.h b/src/gromacs/nbnxm/pairlistparams.h
index 2783ae454d..72bed6748b 100644
--- a/src/gromacs/nbnxm/pairlistparams.h
+++ b/src/gromacs/nbnxm/pairlistparams.h
@@ -62,6 +62,8 @@ static constexpr int c_nbnxnCpuIClusterSize = 4;
 //! The i- and j-cluster size for GPU lists, 8 atoms for CUDA, set at compile time for OpenCL
 #if GMX_GPU_OPENCL
 static constexpr int c_nbnxnGpuClusterSize = GMX_OPENCL_NB_CLUSTER_SIZE;
+#elif GMX_GPU_SYCL
+static constexpr int c_nbnxnGpuClusterSize = 4;
 #else
 static constexpr int c_nbnxnGpuClusterSize = 8;
 #endif
diff --git a/src/gromacs/nbnxm/sycl/CMakeLists.txt b/src/gromacs/nbnxm/sycl/CMakeLists.txt
new file mode 100644
index 0000000000..4a6cc29b19
--- /dev/null
+++ b/src/gromacs/nbnxm/sycl/CMakeLists.txt
@@ -0,0 +1,39 @@
+#
+# This file is part of the GROMACS molecular simulation package.
+#
+# Copyright (c) 2020,2021, by the GROMACS development team, led by
+# Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+# and including many others, as listed in the AUTHORS file in the
+# top-level source directory and at http://www.gromacs.org.
+#
+# GROMACS is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public License
+# as published by the Free Software Foundation; either version 2.1
+# of the License, or (at your option) any later version.
+#
+# GROMACS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with GROMACS; if not, see
+# http://www.gnu.org/licenses, or write to the Free Software Foundation,
+# Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+#
+# If you want to redistribute modifications to GROMACS, please
+# consider that scientific software is very special. Version
+# control is crucial - bugs must be traceable. We will be happy to
+# consider code for inclusion in the official distribution, but
+# derived work must not be called official GROMACS. Details are found
+# in the README & COPYING files - if they are missing, get the
+# official version at http://www.gromacs.org.
+#
+# To help us fund GROMACS development, we humbly ask that you cite
+# the research papers on the package. Check out http://www.gromacs.org.
+
+if (GMX_GPU_SYCL)
+    file(GLOB NBNXM_SYCL_SOURCES *.cpp)
+    set(NBNXM_SOURCES ${NBNXM_SOURCES} ${NBNXM_SYCL_SOURCES} PARENT_SCOPE)
+    _gmx_add_files_to_property(SYCL_SOURCES ${NBNXM_SYCL_SOURCES})
+endif()
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
new file mode 100644
index 0000000000..c4a2d3f725
--- /dev/null
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp
@@ -0,0 +1,269 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief
+ *  Data management and kernel launch functions for nbnxm sycl.
+ *
+ *  \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "gromacs/nbnxm/gpu_common.h"
+#include "gromacs/utility/exceptions.h"
+
+#include "nbnxm_sycl_kernel.h"
+#include "nbnxm_sycl_kernel_pruneonly.h"
+#include "nbnxm_sycl_types.h"
+
+namespace Nbnxm
+{
+
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ */
+void gpu_launch_cpyback(NbnxmGpu*                nb,
+                        struct nbnxn_atomdata_t* nbatom,
+                        const gmx::StepWorkload& stepWork,
+                        const AtomLocality       atomLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+    const InteractionLocality iloc         = gpuAtomToInteractionLocality(atomLocality);
+    const DeviceStream&       deviceStream = *nb->deviceStreams[iloc];
+    sycl_atomdata_t*          adat         = nb->atdat;
+
+    /* don't launch non-local copy-back if there was no non-local work to do */
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+    {
+        nb->bNonLocalStreamActive = false;
+        return;
+    }
+
+    int adatBegin, adatLen;
+    getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
+
+    // With DD the local D2H transfer can only start after the non-local kernel has finished.
+    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
+    {
+        nb->nonlocal_done.waitForEvent();
+    }
+
+    /* DtoH f
+     * Skip if buffer ops / reduction is offloaded to the GPU.
+     */
+    if (!stepWork.useGpuFBufferOps)
+    {
+        GMX_ASSERT(adat->f.elementSize() == sizeof(float3),
+                   "The size of the force buffer element should be equal to the size of float3.");
+        copyFromDeviceBuffer(reinterpret_cast<float3*>(nbatom->out[0].f.data()) + adatBegin,
+                             &adat->f,
+                             adatBegin,
+                             adatLen,
+                             deviceStream,
+                             GpuApiCallBehavior::Async,
+                             nullptr);
+    }
+
+    /* After the non-local D2H is launched the nonlocal_done event can be
+       recorded which signals that the local D2H can proceed. This event is not
+       placed after the non-local kernel because we want the non-local data
+       back first. */
+    if (iloc == InteractionLocality::NonLocal)
+    {
+        nb->nonlocal_done.markEvent(deviceStream);
+        nb->bNonLocalStreamActive = true;
+    }
+
+    /* only transfer energies in the local stream */
+    if (iloc == InteractionLocality::Local)
+    {
+        /* DtoH fshift when virial is needed */
+        if (stepWork.computeVirial)
+        {
+            GMX_ASSERT(sizeof(*nb->nbst.fshift) == adat->fShift.elementSize(),
+                       "Sizes of host- and device-side shift vector elements should be the same.");
+            copyFromDeviceBuffer(
+                    nb->nbst.fshift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
+        }
+
+        /* DtoH energies */
+        if (stepWork.computeEnergy)
+        {
+            GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
+                       "Sizes of host- and device-side LJ energy terms should be the same.");
+            copyFromDeviceBuffer(
+                    nb->nbst.e_lj, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+            GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
+                       "Sizes of host- and device-side electrostatic energy terms should be the "
+                       "same.");
+            copyFromDeviceBuffer(
+                    nb->nbst.e_el, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+        }
+    }
+}
+
+/*! \brief Launch asynchronously the xq buffer host to device copy. */
+void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
+{
+    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+    validateGpuAtomLocality(atomLocality);
+
+    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
+
+    sycl_atomdata_t*    adat         = nb->atdat;
+    gpu_plist*          plist        = nb->plist[iloc];
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+    /* Don't launch the non-local H2D copy if there is no dependent
+       work to do: neither non-local nor other (e.g. bonded) work
+       to do that has as input the nbnxn coordinates.
+       Doing the same for the local kernel is more complicated, since the
+       local part of the force array also depends on the non-local kernel.
+       So to avoid complicating the code and to reduce the risk of bugs,
+       we always call the local local x+q copy (and the rest of the local
+       work in nbnxn_gpu_launch_kernel().
+     */
+    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
+    {
+        plist->haveFreshList = false;
+        return;
+    }
+
+    int adatBegin, adatLen;
+    getGpuAtomRange(adat, atomLocality, &adatBegin, &adatLen);
+
+    /* HtoD x, q */
+    GMX_ASSERT(adat->xq.elementSize() == sizeof(float4),
+               "The size of the xyzq buffer element should be equal to the size of float4.");
+    copyToDeviceBuffer(&adat->xq,
+                       reinterpret_cast<const float4*>(nbatom->x().data()) + adatBegin,
+                       adatBegin,
+                       adatLen,
+                       deviceStream,
+                       GpuApiCallBehavior::Async,
+                       nullptr);
+
+    /* No need to enforce stream synchronization with events like we do in CUDA/OpenCL.
+     * Runtime should do the scheduling correctly based on data dependencies. */
+}
+
+void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
+{
+    gpu_plist* plist = nb->plist[iloc];
+
+    if (plist->haveFreshList)
+    {
+        GMX_ASSERT(numParts == 1, "With first pruning we expect 1 part");
+
+        /* Set rollingPruningNumParts to signal that it is not set */
+        plist->rollingPruningNumParts = 0;
+        plist->rollingPruningPart     = 0;
+    }
+    else
+    {
+        if (plist->rollingPruningNumParts == 0)
+        {
+            plist->rollingPruningNumParts = numParts;
+        }
+        else
+        {
+            GMX_ASSERT(numParts == plist->rollingPruningNumParts,
+                       "It is not allowed to change numParts in between list generation steps");
+        }
+    }
+
+    /* Use a local variable for part and update in plist, so we can return here
+     * without duplicating the part increment code.
+     */
+    const int part = plist->rollingPruningPart;
+
+    plist->rollingPruningPart++;
+    if (plist->rollingPruningPart >= plist->rollingPruningNumParts)
+    {
+        plist->rollingPruningPart = 0;
+    }
+
+    /* Compute the number of list entries to prune in this pass */
+    const int numSciInPart = (plist->nsci - part) / numParts;
+
+    /* Don't launch the kernel if there is no work to do */
+    if (numSciInPart <= 0)
+    {
+        plist->haveFreshList = false;
+        return;
+    }
+
+    launchNbnxmKernelPruneOnly(nb, iloc, numParts, part, numSciInPart);
+
+    if (plist->haveFreshList)
+    {
+        plist->haveFreshList = false;
+        nb->didPrune[iloc]   = true; // Mark that pruning has been done
+    }
+    else
+    {
+        nb->didRollingPrune[iloc] = true; // Mark that rolling pruning has been done
+    }
+}
+
+void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
+{
+    const NBParamGpu* nbp   = nb->nbparam;
+    gpu_plist*        plist = nb->plist[iloc];
+
+    if (canSkipNonbondedWork(*nb, iloc))
+    {
+        plist->haveFreshList = false;
+        return;
+    }
+
+    if (nbp->useDynamicPruning && plist->haveFreshList)
+    {
+        // Prunes for rlistOuter and rlistInner, sets plist->haveFreshList=false
+        gpu_launch_kernel_pruneonly(nb, iloc, 1);
+    }
+
+    if (plist->nsci == 0)
+    {
+        /* Don't launch an empty local kernel */
+        return;
+    }
+
+    launchNbnxmKernel(nb, stepWork, iloc);
+}
+
+} // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
new file mode 100644
index 0000000000..e35b974c3a
--- /dev/null
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp
@@ -0,0 +1,354 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief
+ *  Stubs of functions that must be defined by nbnxm sycl implementation.
+ *
+ *  \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "gromacs/gpu_utils/device_stream_manager.h"
+#include "gromacs/hardware/device_information.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/nbnxm/atomdata.h"
+#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/nbnxm_gpu.h"
+#include "gromacs/nbnxm/nbnxm_gpu_data_mgmt.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/exceptions.h"
+
+#include "nbnxm_sycl_types.h"
+
+namespace Nbnxm
+{
+
+//! This function is documented in the header file
+void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
+{
+    sycl_atomdata_t*    adat        = nb->atdat;
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+    // Clear forces
+    clearDeviceBufferAsync(&adat->f, 0, nb->atdat->natoms, localStream);
+    // Clear shift force array and energies if the outputs were used in the current step
+    if (computeVirial)
+    {
+        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
+        clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
+        clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
+    }
+}
+
+/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
+static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext)
+{
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+    sycl_atomdata_t*    atomdata    = nb->atdat;
+    atomdata->numTypes              = numTypes;
+    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
+    atomdata->shiftVecUploaded = false;
+
+    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
+    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
+    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
+
+    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
+    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
+    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
+
+    /* initialize to nullptr pointers to data that is not allocated here and will
+       need reallocation in later */
+    atomdata->xq = nullptr;
+    atomdata->f  = nullptr;
+
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    atomdata->natoms   = -1;
+    atomdata->numAlloc = -1;
+}
+
+/*! \brief Initialize the nonbonded parameter data structure. */
+static void initNbparam(NBParamGpu*                     nbp,
+                        const interaction_const_t&      ic,
+                        const PairlistParams&           listParams,
+                        const nbnxn_atomdata_t::Params& nbatParams,
+                        const DeviceContext&            deviceContext)
+{
+    const int numTypes = nbatParams.numTypes;
+
+    set_cutoff_parameters(nbp, &ic, listParams);
+
+    nbp->vdwType  = nbnxmGpuPickVdwKernelType(&ic, nbatParams.ljCombinationRule);
+    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(&ic, deviceContext.deviceInfo());
+
+    /* generate table for PME */
+    nbp->coulomb_tab = nullptr;
+    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
+    {
+        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
+    }
+
+    /* set up LJ parameter lookup table */
+    if (!useLjCombRule(nbp->vdwType))
+    {
+        initParamLookupTable(
+                &nbp->nbfp, &nbp->nbfp_texobj, nbatParams.nbfp.data(), 2 * numTypes * numTypes, deviceContext);
+    }
+
+    /* set up LJ-PME parameter lookup table */
+    if (ic.vdwtype == evdwPME)
+    {
+        initParamLookupTable(
+                &nbp->nbfp_comb, &nbp->nbfp_comb_texobj, nbatParams.nbfp_comb.data(), 2 * numTypes, deviceContext);
+    }
+}
+
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+                   const interaction_const_t*      ic,
+                   const PairlistParams&           listParams,
+                   const nbnxn_atomdata_t*         nbat,
+                   const bool                      bLocalAndNonlocal)
+{
+    auto* nb                              = new NbnxmGpu();
+    nb->deviceContext_                    = &deviceStreamManager.context();
+    nb->atdat                             = new sycl_atomdata_t;
+    nb->nbparam                           = new NBParamGpu;
+    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
+    if (bLocalAndNonlocal)
+    {
+        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
+    }
+
+    nb->bUseTwoStreams = bLocalAndNonlocal;
+
+    nb->timers  = nullptr;
+    nb->timings = nullptr;
+
+    /* init nbst */
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift));
+
+    init_plist(nb->plist[InteractionLocality::Local]);
+
+    /* local/non-local GPU streams */
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+    nb->deviceStreams[InteractionLocality::Local] =
+            &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
+    // out-of-order. But for the time being, it will be less disruptive to keep them.
+    if (nb->bUseTwoStreams)
+    {
+        init_plist(nb->plist[InteractionLocality::NonLocal]);
+
+        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                           "Non-local non-bonded stream should be initialized to use GPU for "
+                           "non-bonded with domain decomposition.");
+        nb->deviceStreams[InteractionLocality::NonLocal] =
+                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
+    }
+
+    nb->xNonLocalCopyD2HDone = new GpuEventSynchronizer();
+
+    nb->bDoTime = false;
+
+    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
+    const DeviceContext&            deviceContext = *nb->deviceContext_;
+
+    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
+    initAtomdataFirst(nb, nbatParams.numTypes, deviceContext);
+
+    return nb;
+}
+
+void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
+{
+    sycl_atomdata_t*    adat        = nb->atdat;
+    const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+
+    /* only if we have a dynamic box */
+    if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
+    {
+        GMX_ASSERT(adat->shiftVec.elementSize() == sizeof(nbatom->shift_vec[0]),
+                   "Sizes of host- and device-side shift vectors should be the same.");
+        copyToDeviceBuffer(&adat->shiftVec,
+                           reinterpret_cast<const float3*>(nbatom->shift_vec.data()),
+                           0,
+                           SHIFTS,
+                           localStream,
+                           GpuApiCallBehavior::Async,
+                           nullptr);
+        adat->shiftVecUploaded = true;
+    }
+}
+
+void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
+{
+    GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet");
+    sycl_atomdata_t*     atdat         = nb->atdat;
+    const DeviceContext& deviceContext = *nb->deviceContext_;
+    const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
+
+    int  numAtoms    = nbat->numAtoms();
+    bool reallocated = false;
+    if (numAtoms > atdat->numAlloc)
+    {
+        int numAlloc = over_alloc_small(numAtoms);
+
+        /* free up first if the arrays have already been initialized */
+        if (atdat->numAlloc != -1)
+        {
+            freeDeviceBuffer(&atdat->f);
+            freeDeviceBuffer(&atdat->xq);
+            freeDeviceBuffer(&atdat->atomTypes);
+            freeDeviceBuffer(&atdat->ljComb);
+        }
+
+        allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
+        allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
+        if (useLjCombRule(nb->nbparam->vdwType))
+        {
+            allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
+        }
+        else
+        {
+            allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
+        }
+
+        atdat->numAlloc = numAlloc;
+        reallocated     = true;
+    }
+
+    atdat->natoms       = numAtoms;
+    atdat->natoms_local = nbat->natoms_local;
+
+    /* need to clear GPU f output if realloc happened */
+    if (reallocated)
+    {
+        clearDeviceBufferAsync(&atdat->f, 0, atdat->numAlloc, localStream);
+    }
+
+    if (useLjCombRule(nb->nbparam->vdwType))
+    {
+        GMX_ASSERT(atdat->ljComb.elementSize() == sizeof(float2),
+                   "Size of the LJ parameters element should be equal to the size of float2.");
+        copyToDeviceBuffer(&atdat->ljComb,
+                           reinterpret_cast<const float2*>(nbat->params().lj_comb.data()),
+                           0,
+                           numAtoms,
+                           localStream,
+                           GpuApiCallBehavior::Async,
+                           nullptr);
+    }
+    else
+    {
+        GMX_ASSERT(atdat->atomTypes.elementSize() == sizeof(nbat->params().type[0]),
+                   "Sizes of host- and device-side atom types should be the same.");
+        copyToDeviceBuffer(&atdat->atomTypes,
+                           nbat->params().type.data(),
+                           0,
+                           numAtoms,
+                           localStream,
+                           GpuApiCallBehavior::Async,
+                           nullptr);
+    }
+}
+
+void gpu_free(NbnxmGpu* nb)
+{
+    if (nb == nullptr)
+    {
+        return;
+    }
+
+    sycl_atomdata_t* atdat   = nb->atdat;
+    NBParamGpu*      nbparam = nb->nbparam;
+
+    if ((!nbparam->coulomb_tab)
+        && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
+    {
+        destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+    }
+
+    if (!useLjCombRule(nb->nbparam->vdwType))
+    {
+        destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
+    }
+
+    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
+    {
+        destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+    }
+
+    /* Free plist */
+    auto* plist = nb->plist[InteractionLocality::Local];
+    delete plist;
+    if (nb->bUseTwoStreams)
+    {
+        auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
+        delete plist_nl;
+    }
+
+    /* Free nbst */
+    pfree(nb->nbst.e_lj);
+    nb->nbst.e_lj = nullptr;
+
+    pfree(nb->nbst.e_el);
+    nb->nbst.e_el = nullptr;
+
+    pfree(nb->nbst.fshift);
+    nb->nbst.fshift = nullptr;
+
+    delete atdat;
+    delete nbparam;
+    delete nb;
+}
+
+int gpu_min_ci_balanced(NbnxmGpu* nb)
+{
+    // SYCL-TODO: Logic and magic values taken from OpenCL
+    static constexpr unsigned int balancedFactor = 50;
+    if (nb == nullptr)
+    {
+        return 0;
+    }
+    const cl::sycl::device device = nb->deviceContext_->deviceInfo().syclDevice;
+    const int numComputeUnits     = device.get_info<cl::sycl::info::device::max_compute_units>();
+    return balancedFactor * numComputeUnits;
+}
+
+} // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
new file mode 100644
index 0000000000..a68c9d8b2f
--- /dev/null
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp
@@ -0,0 +1,1016 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief
+ *  NBNXM SYCL kernels
+ *
+ *  \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "nbnxm_sycl_kernel.h"
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/math/functions.h"
+#include "gromacs/mdtypes/simulation_workload.h"
+#include "gromacs/pbcutil/ishift.h"
+#include "gromacs/utility/template_mp.h"
+
+#include "nbnxm_sycl_kernel_utils.h"
+#include "nbnxm_sycl_types.h"
+
+namespace Nbnxm
+{
+
+//! \brief Set of boolean constants mimicking preprocessor macros.
+template<enum ElecType elecType, enum VdwType vdwType>
+struct EnergyFunctionProperties {
+    static constexpr bool elecCutoff = (elecType == ElecType::Cut); ///< EL_CUTOFF
+    static constexpr bool elecRF     = (elecType == ElecType::RF);  ///< EL_RF
+    static constexpr bool elecEwaldAna =
+            (elecType == ElecType::EwaldAna || elecType == ElecType::EwaldAnaTwin); ///< EL_EWALD_ANA
+    static constexpr bool elecEwaldTab =
+            (elecType == ElecType::EwaldTab || elecType == ElecType::EwaldTabTwin); ///< EL_EWALD_TAB
+    static constexpr bool elecEwaldTwin =
+            (elecType == ElecType::EwaldAnaTwin || elecType == ElecType::EwaldTabTwin);
+    static constexpr bool elecEwald        = (elecEwaldAna || elecEwaldTab); ///< EL_EWALD_ANY
+    static constexpr bool vdwCombLB        = (vdwType == VdwType::CutCombLB);
+    static constexpr bool vdwCombGeom      = (vdwType == VdwType::CutCombGeom); ///< LJ_COMB_GEOM
+    static constexpr bool vdwComb          = (vdwCombLB || vdwCombGeom);        ///< LJ_COMB
+    static constexpr bool vdwEwaldCombGeom = (vdwType == VdwType::EwaldGeom); ///< LJ_EWALD_COMB_GEOM
+    static constexpr bool vdwEwaldCombLB   = (vdwType == VdwType::EwaldLB);   ///< LJ_EWALD_COMB_LB
+    static constexpr bool vdwEwald         = (vdwEwaldCombGeom || vdwEwaldCombLB); ///< LJ_EWALD
+    static constexpr bool vdwFSwitch       = (vdwType == VdwType::FSwitch); ///< LJ_FORCE_SWITCH
+    static constexpr bool vdwPSwitch       = (vdwType == VdwType::PSwitch); ///< LJ_POT_SWITCH
+};
+
+//! \brief Templated constants to shorten kernel function declaration.
+//@{
+template<enum VdwType vdwType>
+constexpr bool ljComb = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwComb;
+
+template<enum ElecType elecType> // Yes, ElecType
+constexpr bool vdwCutoffCheck = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTwin;
+
+template<enum ElecType elecType>
+constexpr bool elecEwald = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwald;
+
+template<enum ElecType elecType>
+constexpr bool elecEwaldTab = EnergyFunctionProperties<elecType, VdwType::Count>().elecEwaldTab;
+
+template<enum VdwType vdwType>
+constexpr bool ljEwald = EnergyFunctionProperties<ElecType::Count, vdwType>().vdwEwald;
+//@}
+
+using cl::sycl::access::fence_space;
+using cl::sycl::access::mode;
+using cl::sycl::access::target;
+
+static inline void convertSigmaEpsilonToC6C12(const float                  sigma,
+                                              const float                  epsilon,
+                                              cl::sycl::private_ptr<float> c6,
+                                              cl::sycl::private_ptr<float> c12)
+{
+    const float sigma2 = sigma * sigma;
+    const float sigma6 = sigma2 * sigma2 * sigma2;
+    *c6                = epsilon * sigma6;
+    *c12               = (*c6) * sigma6;
+}
+
+template<bool doCalcEnergies>
+static inline void ljForceSwitch(const shift_consts_t         dispersionShift,
+                                 const shift_consts_t         repulsionShift,
+                                 const float                  rVdwSwitch,
+                                 const float                  c6,
+                                 const float                  c12,
+                                 const float                  rInv,
+                                 const float                  r2,
+                                 cl::sycl::private_ptr<float> fInvR,
+                                 cl::sycl::private_ptr<float> eLJ)
+{
+    /* force switch constants */
+    const float dispShiftV2 = dispersionShift.c2;
+    const float dispShiftV3 = dispersionShift.c3;
+    const float repuShiftV2 = repulsionShift.c2;
+    const float repuShiftV3 = repulsionShift.c3;
+
+    const float r       = r2 * rInv;
+    const float rSwitch = cl::sycl::fdim(r, rVdwSwitch); // max(r - rVdwSwitch, 0)
+
+    *fInvR += -c6 * (dispShiftV2 + dispShiftV3 * rSwitch) * rSwitch * rSwitch * rInv
+              + c12 * (repuShiftV2 + repuShiftV3 * rSwitch) * rSwitch * rSwitch * rInv;
+
+    if constexpr (doCalcEnergies)
+    {
+        const float dispShiftF2 = dispShiftV2 / 3;
+        const float dispShiftF3 = dispShiftV3 / 4;
+        const float repuShiftF2 = repuShiftV2 / 3;
+        const float repuShiftF3 = repuShiftV3 / 4;
+        *eLJ += c6 * (dispShiftF2 + dispShiftF3 * rSwitch) * rSwitch * rSwitch * rSwitch
+                - c12 * (repuShiftF2 + repuShiftF3 * rSwitch) * rSwitch * rSwitch * rSwitch;
+    }
+}
+
+//! \brief Fetch C6 grid contribution coefficients and return the product of these.
+template<enum VdwType vdwType>
+static inline float calculateLJEwaldC6Grid(const DeviceAccessor<float, mode::read> a_nbfpComb,
+                                           const int                               typeI,
+                                           const int                               typeJ)
+{
+    if constexpr (vdwType == VdwType::EwaldGeom)
+    {
+        return a_nbfpComb[2 * typeI] * a_nbfpComb[2 * typeJ];
+    }
+    else
+    {
+        static_assert(vdwType == VdwType::EwaldLB);
+        /* sigma and epsilon are scaled to give 6*C6 */
+        const float c6_i  = a_nbfpComb[2 * typeI];
+        const float c12_i = a_nbfpComb[2 * typeI + 1];
+        const float c6_j  = a_nbfpComb[2 * typeJ];
+        const float c12_j = a_nbfpComb[2 * typeJ + 1];
+
+        const float sigma   = c6_i + c6_j;
+        const float epsilon = c12_i * c12_j;
+
+        const float sigma2 = sigma * sigma;
+        return epsilon * sigma2 * sigma2 * sigma2;
+    }
+}
+
+//! Calculate LJ-PME grid force contribution with geometric or LB combination rule.
+template<bool doCalcEnergies, enum VdwType vdwType>
+static inline void ljEwaldComb(const DeviceAccessor<float, mode::read> a_nbfpComb,
+                               const float                             sh_lj_ewald,
+                               const int                               typeI,
+                               const int                               typeJ,
+                               const float                             r2,
+                               const float                             r2Inv,
+                               const float                             lje_coeff2,
+                               const float                             lje_coeff6_6,
+                               const float                             int_bit,
+                               cl::sycl::private_ptr<float>            fInvR,
+                               cl::sycl::private_ptr<float>            eLJ)
+{
+    const float c6grid = calculateLJEwaldC6Grid<vdwType>(a_nbfpComb, typeI, typeJ);
+
+    /* Recalculate inv_r6 without exclusion mask */
+    const float inv_r6_nm = r2Inv * r2Inv * r2Inv;
+    const float cr2       = lje_coeff2 * r2;
+    const float expmcr2   = cl::sycl::exp(-cr2);
+    const float poly      = 1.0F + cr2 + 0.5F * cr2 * cr2;
+
+    /* Subtract the grid force from the total LJ force */
+    *fInvR += c6grid * (inv_r6_nm - expmcr2 * (inv_r6_nm * poly + lje_coeff6_6)) * r2Inv;
+
+    if constexpr (doCalcEnergies)
+    {
+        /* Shift should be applied only to real LJ pairs */
+        const float sh_mask = sh_lj_ewald * int_bit;
+        *eLJ += c_oneSixth * c6grid * (inv_r6_nm * (1.0F - expmcr2 * poly) + sh_mask);
+    }
+}
+
+/*! \brief Apply potential switch. */
+template<bool doCalcEnergies>
+static inline void ljPotentialSwitch(const switch_consts_t        vdwSwitch,
+                                     const float                  rVdwSwitch,
+                                     const float                  rInv,
+                                     const float                  r2,
+                                     cl::sycl::private_ptr<float> fInvR,
+                                     cl::sycl::private_ptr<float> eLJ)
+{
+    /* potential switch constants */
+    const float switchV3 = vdwSwitch.c3;
+    const float switchV4 = vdwSwitch.c4;
+    const float switchV5 = vdwSwitch.c5;
+    const float switchF2 = 3 * vdwSwitch.c3;
+    const float switchF3 = 4 * vdwSwitch.c4;
+    const float switchF4 = 5 * vdwSwitch.c5;
+
+    const float r       = r2 * rInv;
+    const float rSwitch = r - rVdwSwitch;
+
+    if (rSwitch > 0.0F)
+    {
+        const float sw =
+                1.0F + (switchV3 + (switchV4 + switchV5 * rSwitch) * rSwitch) * rSwitch * rSwitch * rSwitch;
+        const float dsw = (switchF2 + (switchF3 + switchF4 * rSwitch) * rSwitch) * rSwitch * rSwitch;
+
+        *fInvR = (*fInvR) * sw - rInv * (*eLJ) * dsw;
+        if constexpr (doCalcEnergies)
+        {
+            *eLJ *= sw;
+        }
+    }
+}
+
+
+/*! \brief Calculate analytical Ewald correction term. */
+static inline float pmeCorrF(const float z2)
+{
+    constexpr float FN6 = -1.7357322914161492954e-8F;
+    constexpr float FN5 = 1.4703624142580877519e-6F;
+    constexpr float FN4 = -0.000053401640219807709149F;
+    constexpr float FN3 = 0.0010054721316683106153F;
+    constexpr float FN2 = -0.019278317264888380590F;
+    constexpr float FN1 = 0.069670166153766424023F;
+    constexpr float FN0 = -0.75225204789749321333F;
+
+    constexpr float FD4 = 0.0011193462567257629232F;
+    constexpr float FD3 = 0.014866955030185295499F;
+    constexpr float FD2 = 0.11583842382862377919F;
+    constexpr float FD1 = 0.50736591960530292870F;
+    constexpr float FD0 = 1.0F;
+
+    const float z4 = z2 * z2;
+
+    float       polyFD0 = FD4 * z4 + FD2;
+    const float polyFD1 = FD3 * z4 + FD1;
+    polyFD0             = polyFD0 * z4 + FD0;
+    polyFD0             = polyFD1 * z2 + polyFD0;
+
+    polyFD0 = 1.0F / polyFD0;
+
+    float polyFN0 = FN6 * z4 + FN4;
+    float polyFN1 = FN5 * z4 + FN3;
+    polyFN0       = polyFN0 * z4 + FN2;
+    polyFN1       = polyFN1 * z4 + FN1;
+    polyFN0       = polyFN0 * z4 + FN0;
+    polyFN0       = polyFN1 * z2 + polyFN0;
+
+    return polyFN0 * polyFD0;
+}
+
+/*! \brief Linear interpolation using exactly two FMA operations.
+ *
+ *  Implements numeric equivalent of: (1-t)*d0 + t*d1.
+ */
+template<typename T>
+static inline T lerp(T d0, T d1, T t)
+{
+    return fma(t, d1, fma(-t, d0, d0));
+}
+
+/*! \brief Interpolate Ewald coulomb force correction using the F*r table. */
+static inline float interpolateCoulombForceR(const DeviceAccessor<float, mode::read> a_coulombTab,
+                                             const float coulombTabScale,
+                                             const float r)
+{
+    const float normalized = coulombTabScale * r;
+    const int   index      = static_cast<int>(normalized);
+    const float fraction   = normalized - index;
+
+    const float left  = a_coulombTab[index];
+    const float right = a_coulombTab[index + 1];
+
+    return lerp(left, right, fraction); // TODO: cl::sycl::mix
+}
+
+static inline void reduceForceJShuffle(float3                                  f,
+                                       const cl::sycl::nd_item<1>              itemIdx,
+                                       const int                               tidxi,
+                                       const int                               aidx,
+                                       DeviceAccessor<float, mode::read_write> a_f)
+{
+    static_assert(c_clSize == 8 || c_clSize == 4);
+    sycl_2020::sub_group sg = itemIdx.get_sub_group();
+
+    f[0] += shuffleDown(f[0], 1, sg);
+    f[1] += shuffleUp(f[1], 1, sg);
+    f[2] += shuffleDown(f[2], 1, sg);
+    if (tidxi & 1)
+    {
+        f[0] = f[1];
+    }
+
+    f[0] += shuffleDown(f[0], 2, sg);
+    f[2] += shuffleUp(f[2], 2, sg);
+    if (tidxi & 2)
+    {
+        f[0] = f[2];
+    }
+
+    if constexpr (c_clSize == 8)
+    {
+        f[0] += shuffleDown(f[0], 4, sg);
+    }
+
+    if (tidxi < 3)
+    {
+        atomicFetchAdd(a_f, 3 * aidx + tidxi, f[0]);
+    }
+}
+
+
+/*! \brief Final i-force reduction.
+ *
+ * This implementation works only with power of two array sizes.
+ */
+static inline void reduceForceIAndFShift(cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_buf,
+                                         const float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster],
+                                         const bool   calcFShift,
+                                         const cl::sycl::nd_item<1>              itemIdx,
+                                         const int                               tidxi,
+                                         const int                               tidxj,
+                                         const int                               sci,
+                                         const int                               shift,
+                                         DeviceAccessor<float, mode::read_write> a_f,
+                                         DeviceAccessor<float, mode::read_write> a_fShift)
+{
+    static constexpr int bufStride  = c_clSize * c_clSize;
+    static constexpr int clSizeLog2 = gmx::StaticLog2<c_clSize>::value;
+    const int            tidx       = tidxi + tidxj * c_clSize;
+    float                fShiftBuf  = 0;
+    for (int ciOffset = 0; ciOffset < c_nbnxnGpuNumClusterPerSupercluster; ciOffset++)
+    {
+        const int aidx = (sci * c_nbnxnGpuNumClusterPerSupercluster + ciOffset) * c_clSize + tidxi;
+        /* store i forces in shmem */
+        sm_buf[tidx]                 = fCiBuf[ciOffset][0];
+        sm_buf[bufStride + tidx]     = fCiBuf[ciOffset][1];
+        sm_buf[2 * bufStride + tidx] = fCiBuf[ciOffset][2];
+        itemIdx.barrier(fence_space::local_space);
+
+        /* Reduce the initial c_clSize values for each i atom to half
+         * every step by using c_clSize * i threads. */
+        int i = c_clSize / 2;
+        for (int j = clSizeLog2 - 1; j > 0; j--)
+        {
+            if (tidxj < i)
+            {
+                sm_buf[tidxj * c_clSize + tidxi] += sm_buf[(tidxj + i) * c_clSize + tidxi];
+                sm_buf[bufStride + tidxj * c_clSize + tidxi] +=
+                        sm_buf[bufStride + (tidxj + i) * c_clSize + tidxi];
+                sm_buf[2 * bufStride + tidxj * c_clSize + tidxi] +=
+                        sm_buf[2 * bufStride + (tidxj + i) * c_clSize + tidxi];
+            }
+            i >>= 1;
+            itemIdx.barrier(fence_space::local_space);
+        }
+
+        /* i == 1, last reduction step, writing to global mem */
+        /* Split the reduction between the first 3 line threads
+           Threads with line id 0 will do the reduction for (float3).x components
+           Threads with line id 1 will do the reduction for (float3).y components
+           Threads with line id 2 will do the reduction for (float3).z components. */
+        if (tidxj < 3)
+        {
+            const float f =
+                    sm_buf[tidxj * bufStride + tidxi] + sm_buf[tidxj * bufStride + c_clSize + tidxi];
+            atomicFetchAdd(a_f, 3 * aidx + tidxj, f);
+            if (calcFShift)
+            {
+                fShiftBuf += f;
+            }
+        }
+        itemIdx.barrier(fence_space::local_space);
+    }
+    /* add up local shift forces into global mem */
+    if (calcFShift)
+    {
+        /* Only threads with tidxj < 3 will update fshift.
+           The threads performing the update must be the same as the threads
+           storing the reduction result above. */
+        if (tidxj < 3)
+        {
+            atomicFetchAdd(a_fShift, 3 * shift + tidxj, fShiftBuf);
+        }
+    }
+}
+
+
+/*! \brief Main kernel for NBNXM.
+ *
+ */
+template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
+auto nbnxmKernel(cl::sycl::handler&                                        cgh,
+                 DeviceAccessor<float4, mode::read>                        a_xq,
+                 DeviceAccessor<float, mode::read_write>                   a_f,
+                 DeviceAccessor<float3, mode::read>                        a_shiftVec,
+                 DeviceAccessor<float, mode::read_write>                   a_fShift,
+                 OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyElec,
+                 OptionalAccessor<float, mode::read_write, doCalcEnergies> a_energyVdw,
+                 DeviceAccessor<nbnxn_cj4_t, doPruneNBL ? mode::read_write : mode::read> a_plistCJ4,
+                 DeviceAccessor<nbnxn_sci_t, mode::read>                                 a_plistSci,
+                 DeviceAccessor<nbnxn_excl_t, mode::read>                    a_plistExcl,
+                 OptionalAccessor<float2, mode::read, ljComb<vdwType>>       a_ljComb,
+                 OptionalAccessor<int, mode::read, !ljComb<vdwType>>         a_atomTypes,
+                 OptionalAccessor<float, mode::read, !ljComb<vdwType>>       a_nbfp,
+                 OptionalAccessor<float, mode::read, ljEwald<vdwType>>       a_nbfpComb,
+                 OptionalAccessor<float, mode::read, elecEwaldTab<elecType>> a_coulombTab,
+                 const int                                                   numTypes,
+                 const float                                                 rCoulombSq,
+                 const float                                                 rVdwSq,
+                 const float                                                 twoKRf,
+                 const float                                                 ewaldBeta,
+                 const float                                                 rlistOuterSq,
+                 const float                                                 ewaldShift,
+                 const float                                                 epsFac,
+                 const float                                                 ewaldCoeffLJ,
+                 const float                                                 cRF,
+                 const shift_consts_t                                        dispersionShift,
+                 const shift_consts_t                                        repulsionShift,
+                 const switch_consts_t                                       vdwSwitch,
+                 const float                                                 rVdwSwitch,
+                 const float                                                 ljEwaldShift,
+                 const float                                                 coulombTabScale,
+                 const bool                                                  calcShift)
+{
+    static constexpr EnergyFunctionProperties<elecType, vdwType> props;
+
+    cgh.require(a_xq);
+    cgh.require(a_f);
+    cgh.require(a_shiftVec);
+    cgh.require(a_fShift);
+    if constexpr (doCalcEnergies)
+    {
+        cgh.require(a_energyElec);
+        cgh.require(a_energyVdw);
+    }
+    cgh.require(a_plistCJ4);
+    cgh.require(a_plistSci);
+    cgh.require(a_plistExcl);
+    if constexpr (!props.vdwComb)
+    {
+        cgh.require(a_atomTypes);
+        cgh.require(a_nbfp);
+    }
+    else
+    {
+        cgh.require(a_ljComb);
+    }
+    if constexpr (props.vdwEwald)
+    {
+        cgh.require(a_nbfpComb);
+    }
+    if constexpr (props.elecEwaldTab)
+    {
+        cgh.require(a_coulombTab);
+    }
+
+    // shmem buffer for i x+q pre-loading
+    cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+            cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
+
+    // shmem buffer for force reduction
+    // SYCL-TODO: Make into 3D; section 4.7.6.11 of SYCL2020 specs
+    cl::sycl::accessor<float, 1, mode::read_write, target::local> sm_reductionBuffer(
+            cl::sycl::range<1>(c_clSize * c_clSize * DIM), cgh);
+
+    auto sm_atomTypeI = [&]() {
+        if constexpr (!props.vdwComb)
+        {
+            return cl::sycl::accessor<int, 2, mode::read_write, target::local>(
+                    cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
+        }
+        else
+        {
+            return nullptr;
+        }
+    }();
+
+    auto sm_ljCombI = [&]() {
+        if constexpr (props.vdwComb)
+        {
+            return cl::sycl::accessor<float2, 2, mode::read_write, target::local>(
+                    cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
+        }
+        else
+        {
+            return nullptr;
+        }
+    }();
+
+    /* Flag to control the calculation of exclusion forces in the kernel
+     * We do that with Ewald (elec/vdw) and RF. Cut-off only has exclusion
+     * energy terms. */
+    constexpr bool doExclusionForces =
+            (props.elecEwald || props.elecRF || props.vdwEwald || (props.elecCutoff && doCalcEnergies));
+
+    constexpr int subGroupSize = c_clSize * c_clSize / 2;
+
+    return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(subGroupSize)]]
+    {
+        /* thread/block/warp id-s */
+        const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
+        const unsigned        tidxi   = localId[0];
+        const unsigned        tidxj   = localId[1];
+        const unsigned        tidx    = tidxj * c_clSize + tidxi;
+        const unsigned        tidxz   = 0;
+
+        // Group indexing was flat originally, no need to unflatten it.
+        const unsigned bidx = itemIdx.get_group(0);
+
+        const sycl_2020::sub_group sg = itemIdx.get_sub_group();
+        // Better use sg.get_group_range, but too much of the logic relies on it anyway
+        const unsigned widx = tidx / subGroupSize;
+
+        float3 fCiBuf[c_nbnxnGpuNumClusterPerSupercluster]; // i force buffer
+        for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+        {
+            fCiBuf[i] = float3(0.0F, 0.0F, 0.0F);
+        }
+
+        const nbnxn_sci_t nbSci     = a_plistSci[bidx];
+        const int         sci       = nbSci.sci;
+        const int         cij4Start = nbSci.cj4_ind_start;
+        const int         cij4End   = nbSci.cj4_ind_end;
+
+        // Only needed if props.elecEwaldAna
+        const float beta2 = ewaldBeta * ewaldBeta;
+        const float beta3 = ewaldBeta * ewaldBeta * ewaldBeta;
+
+        for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += c_clSize)
+        {
+            /* Pre-load i-atom x and q into shared memory */
+            const int             ci       = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i;
+            const int             ai       = ci * c_clSize + tidxi;
+            const cl::sycl::id<2> cacheIdx = cl::sycl::id<2>(tidxj + i, tidxi);
+
+            const float3 shift = a_shiftVec[nbSci.shift];
+            float4       xqi   = a_xq[ai];
+            xqi += float4(shift[0], shift[1], shift[2], 0.0F);
+            xqi[3] *= epsFac;
+            sm_xq[cacheIdx] = xqi;
+
+            if constexpr (!props.vdwComb)
+            {
+                // Pre-load the i-atom types into shared memory
+                sm_atomTypeI[cacheIdx] = a_atomTypes[ai];
+            }
+            else
+            {
+                // Pre-load the LJ combination parameters into shared memory
+                sm_ljCombI[cacheIdx] = a_ljComb[ai];
+            }
+        }
+        itemIdx.barrier(fence_space::local_space);
+
+        float ewaldCoeffLJ_2, ewaldCoeffLJ_6_6; // Only needed if (props.vdwEwald)
+        if constexpr (props.vdwEwald)
+        {
+            ewaldCoeffLJ_2   = ewaldCoeffLJ * ewaldCoeffLJ;
+            ewaldCoeffLJ_6_6 = ewaldCoeffLJ_2 * ewaldCoeffLJ_2 * ewaldCoeffLJ_2 * c_oneSixth;
+        }
+
+        float energyVdw, energyElec; // Only needed if (doCalcEnergies)
+        if constexpr (doCalcEnergies)
+        {
+            energyVdw = energyElec = 0.0F;
+        }
+        if constexpr (doCalcEnergies && doExclusionForces)
+        {
+            if (nbSci.shift == CENTRAL && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
+            {
+                // we have the diagonal: add the charge and LJ self interaction energy term
+                for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+                {
+                    // TODO: Are there other options?
+                    if constexpr (props.elecEwald || props.elecRF || props.elecCutoff)
+                    {
+                        const float qi = sm_xq[i][tidxi][3];
+                        energyElec += qi * qi;
+                    }
+                    if constexpr (props.vdwEwald)
+                    {
+                        energyVdw +=
+                                a_nbfp[a_atomTypes[(sci * c_nbnxnGpuNumClusterPerSupercluster + i) * c_clSize + tidxi]
+                                       * (numTypes + 1) * 2];
+                    }
+                }
+                /* divide the self term(s) equally over the j-threads, then multiply with the coefficients. */
+                if constexpr (props.vdwEwald)
+                {
+                    energyVdw /= c_clSize;
+                    energyVdw *= 0.5F * c_oneSixth * ewaldCoeffLJ_6_6; // c_OneTwelfth?
+                }
+                if constexpr (props.elecRF || props.elecCutoff)
+                {
+                    // Correct for epsfac^2 due to adding qi^2 */
+                    energyElec /= epsFac * c_clSize;
+                    energyElec *= -0.5F * cRF;
+                }
+                if constexpr (props.elecEwald)
+                {
+                    // Correct for epsfac^2 due to adding qi^2 */
+                    energyElec /= epsFac * c_clSize;
+                    energyElec *= -ewaldBeta * c_OneOverSqrtPi; /* last factor 1/sqrt(pi) */
+                }
+            } // (nbSci.shift == CENTRAL && a_plistCJ4[cij4Start].cj[0] == sci * c_nbnxnGpuNumClusterPerSupercluster)
+        }     // (doCalcEnergies && doExclusionForces)
+
+        // Only needed if (doExclusionForces)
+        const bool nonSelfInteraction = !(nbSci.shift == CENTRAL & tidxj <= tidxi);
+
+        // loop over the j clusters = seen by any of the atoms in the current super-cluster
+        for (int j4 = cij4Start + tidxz; j4 < cij4End; j4 += 1)
+        {
+            unsigned imask = a_plistCJ4[j4].imei[widx].imask;
+            if (!doPruneNBL && !imask)
+            {
+                continue;
+            }
+            const int wexclIdx = a_plistCJ4[j4].imei[widx].excl_ind;
+            const unsigned wexcl = a_plistExcl[wexclIdx].pair[tidx & (subGroupSize - 1)]; // sg.get_local_linear_id()
+            for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
+            {
+                const bool maskSet =
+                        imask & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster));
+                if (!maskSet)
+                {
+                    continue;
+                }
+                unsigned  maskJI = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
+                const int cj     = a_plistCJ4[j4].cj[jm];
+                const int aj     = cj * c_clSize + tidxj;
+
+                // load j atom data
+                const float4 xqj = a_xq[aj];
+
+                const float3 xj(xqj[0], xqj[1], xqj[2]);
+                const float  qj = xqj[3];
+                int          atomTypeJ; // Only needed if (!props.vdwComb)
+                float2       ljCombJ;   // Only needed if (props.vdwComb)
+                if constexpr (props.vdwComb)
+                {
+                    ljCombJ = a_ljComb[aj];
+                }
+                else
+                {
+                    atomTypeJ = a_atomTypes[aj];
+                }
+
+                float3 fCjBuf(0.0F, 0.0F, 0.0F);
+
+                for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+                {
+                    if (imask & maskJI)
+                    {
+                        // i cluster index
+                        const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + i;
+                        // all threads load an atom from i cluster ci into shmem!
+                        const float4 xqi = sm_xq[i][tidxi];
+                        const float3 xi(xqi[0], xqi[1], xqi[2]);
+
+                        // distance between i and j atoms
+                        const float3 rv = xi - xj;
+                        float        r2 = norm2(rv);
+
+                        if constexpr (doPruneNBL)
+                        {
+                            /* If _none_ of the atoms pairs are in cutoff range,
+                             * the bit corresponding to the current
+                             * cluster-pair in imask gets set to 0. */
+                            if (!sycl_2020::group_any_of(sg, r2 < rlistOuterSq))
+                            {
+                                imask &= ~maskJI;
+                            }
+                        }
+                        const float pairExclMask = (wexcl & maskJI) ? 1.0F : 0.0F;
+
+                        // cutoff & exclusion check
+
+                        const bool notExcluded = doExclusionForces ? (nonSelfInteraction | (ci != cj))
+                                                                   : (wexcl & maskJI);
+
+                        // SYCL-TODO: Check optimal way of branching here.
+                        if ((r2 < rCoulombSq) && notExcluded)
+                        {
+                            const float qi = xqi[3];
+                            int         atomTypeI; // Only needed if (!props.vdwComb)
+                            float       c6, c12, sigma, epsilon;
+
+                            if constexpr (!props.vdwComb)
+                            {
+                                /* LJ 6*C6 and 12*C12 */
+                                atomTypeI     = sm_atomTypeI[i][tidxi];
+                                const int idx = (numTypes * atomTypeI + atomTypeJ) * 2;
+                                c6            = a_nbfp[idx]; // TODO: Make a_nbfm into float2
+                                c12           = a_nbfp[idx + 1];
+                            }
+                            else
+                            {
+                                const float2 ljCombI = sm_ljCombI[i][tidxi];
+                                if constexpr (props.vdwCombGeom)
+                                {
+                                    c6  = ljCombI[0] * ljCombJ[0];
+                                    c12 = ljCombI[1] * ljCombJ[1];
+                                }
+                                else
+                                {
+                                    static_assert(props.vdwCombLB);
+                                    // LJ 2^(1/6)*sigma and 12*epsilon
+                                    sigma   = ljCombI[0] + ljCombJ[0];
+                                    epsilon = ljCombI[1] * ljCombJ[1];
+                                    if constexpr (doCalcEnergies)
+                                    {
+                                        convertSigmaEpsilonToC6C12(sigma, epsilon, &c6, &c12);
+                                    }
+                                } // props.vdwCombGeom
+                            }     // !props.vdwComb
+
+                            // Ensure distance do not become so small that r^-12 overflows
+                            r2 = std::max(r2, c_nbnxnMinDistanceSquared);
+                            // SYCL-TODO: sycl::half_precision::rsqrt?
+                            const float rInv  = cl::sycl::native::rsqrt(r2);
+                            const float r2Inv = rInv * rInv;
+                            float       r6Inv, fInvR, energyLJPair;
+                            if constexpr (!props.vdwCombLB || doCalcEnergies)
+                            {
+                                r6Inv = r2Inv * r2Inv * r2Inv;
+                                if constexpr (doExclusionForces)
+                                {
+                                    // SYCL-TODO: Check if true for SYCL
+                                    /* We could mask r2Inv, but with Ewald masking both
+                                     * r6Inv and fInvR is faster */
+                                    r6Inv *= pairExclMask;
+                                }
+                                fInvR = r6Inv * (c12 * r6Inv - c6) * r2Inv;
+                            }
+                            else
+                            {
+                                float sig_r  = sigma * rInv;
+                                float sig_r2 = sig_r * sig_r;
+                                float sig_r6 = sig_r2 * sig_r2 * sig_r2;
+                                if constexpr (doExclusionForces)
+                                {
+                                    sig_r6 *= pairExclMask;
+                                }
+                                fInvR = epsilon * sig_r6 * (sig_r6 - 1.0F) * r2Inv;
+                            } // (!props.vdwCombLB || doCalcEnergies)
+                            if constexpr (doCalcEnergies || props.vdwPSwitch)
+                            {
+                                energyLJPair = pairExclMask
+                                               * (c12 * (r6Inv * r6Inv + repulsionShift.cpot) * c_oneTwelfth
+                                                  - c6 * (r6Inv + dispersionShift.cpot) * c_oneSixth);
+                            }
+                            if constexpr (props.vdwFSwitch)
+                            {
+                                ljForceSwitch<doCalcEnergies>(
+                                        dispersionShift, repulsionShift, rVdwSwitch, c6, c12, rInv, r2, &fInvR, &energyLJPair);
+                            }
+                            if constexpr (props.vdwEwald)
+                            {
+                                ljEwaldComb<doCalcEnergies, vdwType>(a_nbfpComb,
+                                                                     ljEwaldShift,
+                                                                     atomTypeI,
+                                                                     atomTypeJ,
+                                                                     r2,
+                                                                     r2Inv,
+                                                                     ewaldCoeffLJ_2,
+                                                                     ewaldCoeffLJ_6_6,
+                                                                     pairExclMask,
+                                                                     &fInvR,
+                                                                     &energyLJPair);
+                            } // (props.vdwEwald)
+                            if constexpr (props.vdwPSwitch)
+                            {
+                                ljPotentialSwitch<doCalcEnergies>(
+                                        vdwSwitch, rVdwSwitch, rInv, r2, &fInvR, &energyLJPair);
+                            }
+                            if constexpr (props.elecEwaldTwin)
+                            {
+                                // Separate VDW cut-off check to enable twin-range cut-offs
+                                // (rVdw < rCoulomb <= rList)
+                                const float vdwInRange = (r2 < rVdwSq) ? 1.0F : 0.0F;
+                                fInvR *= vdwInRange;
+                                if constexpr (doCalcEnergies)
+                                {
+                                    energyLJPair *= vdwInRange;
+                                }
+                            }
+                            if constexpr (doCalcEnergies)
+                            {
+                                energyVdw += energyLJPair;
+                            }
+
+                            if constexpr (props.elecCutoff)
+                            {
+                                if constexpr (doExclusionForces)
+                                {
+                                    fInvR += qi * qj * pairExclMask * r2Inv * rInv;
+                                }
+                                else
+                                {
+                                    fInvR += qi * qj * r2Inv * rInv;
+                                }
+                            }
+                            if constexpr (props.elecRF)
+                            {
+                                fInvR += qi * qj * (pairExclMask * r2Inv * rInv - twoKRf);
+                            }
+                            if constexpr (props.elecEwaldAna)
+                            {
+                                fInvR += qi * qj
+                                         * (pairExclMask * r2Inv * rInv + pmeCorrF(beta2 * r2) * beta3);
+                            }
+                            if constexpr (props.elecEwaldTab)
+                            {
+                                fInvR += qi * qj
+                                         * (pairExclMask * r2Inv
+                                            - interpolateCoulombForceR(
+                                                      a_coulombTab, coulombTabScale, r2 * rInv))
+                                         * rInv;
+                            }
+
+                            if constexpr (doCalcEnergies)
+                            {
+                                if constexpr (props.elecCutoff)
+                                {
+                                    energyElec += qi * qj * (pairExclMask * rInv - cRF);
+                                }
+                                if constexpr (props.elecRF)
+                                {
+                                    energyElec +=
+                                            qi * qj * (pairExclMask * rInv + 0.5f * twoKRf * r2 - cRF);
+                                }
+                                if constexpr (props.elecEwald)
+                                {
+                                    energyElec +=
+                                            qi * qj
+                                            * (rInv * (pairExclMask - cl::sycl::erf(r2 * rInv * ewaldBeta))
+                                               - pairExclMask * ewaldShift);
+                                }
+                            }
+
+                            const float3 forceIJ = rv * fInvR;
+
+                            /* accumulate j forces in registers */
+                            fCjBuf -= forceIJ;
+                            /* accumulate i forces in registers */
+                            fCiBuf[i] += forceIJ;
+                        } // (r2 < rCoulombSq) && notExcluded
+                    }     // (imask & maskJI)
+                    /* shift the mask bit by 1 */
+                    maskJI += maskJI;
+                } // for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+                /* reduce j forces */
+                reduceForceJShuffle(fCjBuf, itemIdx, tidxi, aj, a_f);
+            } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
+            if constexpr (doPruneNBL)
+            {
+                /* Update the imask with the new one which does not contain the
+                 * out of range clusters anymore. */
+                a_plistCJ4[j4].imei[widx].imask = imask;
+            }
+        } // for (int j4 = cij4Start; j4 < cij4End; j4 += 1)
+
+        /* skip central shifts when summing shift forces */
+        const bool doCalcShift = (calcShift && !(nbSci.shift == CENTRAL));
+
+        reduceForceIAndFShift(
+                sm_reductionBuffer, fCiBuf, doCalcShift, itemIdx, tidxi, tidxj, sci, nbSci.shift, a_f, a_fShift);
+
+        if constexpr (doCalcEnergies)
+        {
+            const float energyVdwGroup = sycl_2020::group_reduce(
+                    itemIdx.get_group(), energyVdw, 0.0F, sycl_2020::plus<float>());
+            const float energyElecGroup = sycl_2020::group_reduce(
+                    itemIdx.get_group(), energyElec, 0.0F, sycl_2020::plus<float>());
+
+            if (tidx == 0)
+            {
+                atomicFetchAdd(a_energyVdw, 0, energyVdwGroup);
+                atomicFetchAdd(a_energyElec, 0, energyElecGroup);
+            }
+        }
+    };
+}
+
+// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
+template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType>
+class NbnxmKernelName;
+
+template<bool doPruneNBL, bool doCalcEnergies, enum ElecType elecType, enum VdwType vdwType, class... Args>
+cl::sycl::event launchNbnxmKernel(const DeviceStream& deviceStream, const int numSci, Args&&... args)
+{
+    // Should not be needed for SYCL2020.
+    using kernelNameType = NbnxmKernelName<doPruneNBL, doCalcEnergies, elecType, vdwType>;
+
+    /* Kernel launch config:
+     * - The thread block dimensions match the size of i-clusters, j-clusters,
+     *   and j-cluster concurrency, in x, y, and z, respectively.
+     * - The 1D block-grid contains as many blocks as super-clusters.
+     */
+    const int                   numBlocks = numSci;
+    const cl::sycl::range<3>    blockSize{ c_clSize, c_clSize, 1 };
+    const cl::sycl::range<3>    globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
+    const cl::sycl::nd_range<3> range{ globalSize, blockSize };
+
+    cl::sycl::queue q = deviceStream.stream();
+
+    cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
+        auto kernel = nbnxmKernel<doPruneNBL, doCalcEnergies, elecType, vdwType>(
+                cgh, std::forward<Args>(args)...);
+        cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
+    });
+
+    return e;
+}
+
+template<class... Args>
+cl::sycl::event chooseAndLaunchNbnxmKernel(bool          doPruneNBL,
+                                           bool          doCalcEnergies,
+                                           enum ElecType elecType,
+                                           enum VdwType  vdwType,
+                                           Args&&... args)
+{
+    return gmx::dispatchTemplatedFunction(
+            [&](auto doPruneNBL_, auto doCalcEnergies_, auto elecType_, auto vdwType_) {
+                return launchNbnxmKernel<doPruneNBL_, doCalcEnergies_, elecType_, vdwType_>(
+                        std::forward<Args>(args)...);
+            },
+            doPruneNBL,
+            doCalcEnergies,
+            elecType,
+            vdwType);
+}
+
+void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc)
+{
+    sycl_atomdata_t*    adat         = nb->atdat;
+    NBParamGpu*         nbp          = nb->nbparam;
+    gpu_plist*          plist        = nb->plist[iloc];
+    const bool          doPruneNBL   = (plist->haveFreshList && !nb->didPrune[iloc]);
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+    // Casting to float simplifies using atomic ops in the kernel
+    cl::sycl::buffer<float3, 1> f(*adat->f.buffer_);
+    auto                        fAsFloat = f.reinterpret<float, 1>(f.get_count() * DIM);
+    cl::sycl::buffer<float3, 1> fShift(*adat->fShift.buffer_);
+    auto fShiftAsFloat = fShift.reinterpret<float, 1>(fShift.get_count() * DIM);
+
+    cl::sycl::event e = chooseAndLaunchNbnxmKernel(doPruneNBL,
+                                                   stepWork.computeEnergy,
+                                                   nbp->elecType,
+                                                   nbp->vdwType,
+                                                   deviceStream,
+                                                   plist->nsci,
+                                                   adat->xq,
+                                                   fAsFloat,
+                                                   adat->shiftVec,
+                                                   fShiftAsFloat,
+                                                   adat->eElec,
+                                                   adat->eLJ,
+                                                   plist->cj4,
+                                                   plist->sci,
+                                                   plist->excl,
+                                                   adat->ljComb,
+                                                   adat->atomTypes,
+                                                   nbp->nbfp,
+                                                   nbp->nbfp_comb,
+                                                   nbp->coulomb_tab,
+                                                   adat->numTypes,
+                                                   nbp->rcoulomb_sq,
+                                                   nbp->rvdw_sq,
+                                                   nbp->two_k_rf,
+                                                   nbp->ewald_beta,
+                                                   nbp->rlistOuter_sq,
+                                                   nbp->sh_ewald,
+                                                   nbp->epsfac,
+                                                   nbp->ewaldcoeff_lj,
+                                                   nbp->c_rf,
+                                                   nbp->dispersion_shift,
+                                                   nbp->repulsion_shift,
+                                                   nbp->vdw_switch,
+                                                   nbp->rvdw_switch,
+                                                   nbp->sh_lj_ewald,
+                                                   nbp->coulomb_tab_scale,
+                                                   stepWork.computeVirial);
+}
+
+} // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h
new file mode 100644
index 0000000000..dd7f52138d
--- /dev/null
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.h
@@ -0,0 +1,60 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Declares nbnxn sycl helper functions
+ *
+ * \ingroup module_nbnxm
+ */
+#ifndef GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_H
+#define GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_H
+
+// Forward declarations
+namespace gmx
+{
+enum class InteractionLocality;
+class StepWorkload;
+} // namespace gmx
+struct NbnxmGpu;
+
+namespace Nbnxm
+{
+using gmx::InteractionLocality;
+
+void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc);
+
+} // namespace Nbnxm
+
+#endif // GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_H
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
new file mode 100644
index 0000000000..cb88eb31d6
--- /dev/null
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp
@@ -0,0 +1,287 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief
+ *  NBNXM SYCL kernels
+ *
+ *  \ingroup module_nbnxm
+ */
+#include "gmxpre.h"
+
+#include "nbnxm_sycl_kernel_pruneonly.h"
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/utility/template_mp.h"
+
+#include "nbnxm_sycl_kernel_utils.h"
+#include "nbnxm_sycl_types.h"
+
+using cl::sycl::access::fence_space;
+using cl::sycl::access::mode;
+using cl::sycl::access::target;
+
+namespace Nbnxm
+{
+
+/*! \brief Prune-only kernel for NBNXM.
+ *
+ */
+template<bool haveFreshList>
+auto nbnxmKernelPruneOnly(cl::sycl::handler&                            cgh,
+                          DeviceAccessor<float4, mode::read>            a_xq,
+                          DeviceAccessor<float3, mode::read>            a_shiftVec,
+                          DeviceAccessor<nbnxn_cj4_t, mode::read_write> a_plistCJ4,
+                          DeviceAccessor<nbnxn_sci_t, mode::read>       a_plistSci,
+                          DeviceAccessor<unsigned int, haveFreshList ? mode::write : mode::read> a_plistIMask,
+                          const float rlistOuterSq,
+                          const float rlistInnerSq,
+                          const int   numParts,
+                          const int   part)
+{
+    cgh.require(a_xq);
+    cgh.require(a_shiftVec);
+    cgh.require(a_plistCJ4);
+    cgh.require(a_plistSci);
+    cgh.require(a_plistIMask);
+
+    /* shmem buffer for i x+q pre-loading */
+    cl::sycl::accessor<float4, 2, mode::read_write, target::local> sm_xq(
+            cl::sycl::range<2>(c_nbnxnGpuNumClusterPerSupercluster, c_clSize), cgh);
+
+    constexpr int warpSize = c_clSize * c_clSize / 2;
+
+    /* Somewhat weird behavior inherited from OpenCL.
+     * With clSize == 4, we use sub_group size of 16 (not enforced in OpenCL implementation, but chosen
+     * by the IGC compiler), however for data layout we consider it to be 8.
+     * Setting sub_group size to 8 slows down the prune-only kernel 1.5-2 times.
+     * For clSize == But we need to set specific sub_group size >= 32 for clSize == 8 for correctness,
+     * but it causes very poor performance.
+     */
+    constexpr int requiredSubGroupSize = (c_clSize == 4) ? 16 : warpSize;
+
+    /* Requirements:
+     * Work group (block) must have range (c_clSize, c_clSize, ...) (for localId calculation, easy
+     * to change). */
+    return [=](cl::sycl::nd_item<1> itemIdx) [[intel::reqd_sub_group_size(requiredSubGroupSize)]]
+    {
+        const cl::sycl::id<3> localId = unflattenId<c_clSize, c_clSize>(itemIdx.get_local_id());
+        // thread/block/warp id-s
+        const unsigned tidxi = localId[0];
+        const unsigned tidxj = localId[1];
+        const int      tidx  = tidxj * c_clSize + tidxi;
+        const unsigned tidxz = localId[2];
+        const unsigned bidx  = itemIdx.get_group(0);
+
+        const sycl_2020::sub_group sg   = itemIdx.get_sub_group();
+        const unsigned             widx = tidx / warpSize;
+
+        // my i super-cluster's index = sciOffset + current bidx * numParts + part
+        const nbnxn_sci_t nbSci     = a_plistSci[bidx * numParts + part];
+        const int         sci       = nbSci.sci;           /* super-cluster */
+        const int         cij4Start = nbSci.cj4_ind_start; /* first ...*/
+        const int         cij4End   = nbSci.cj4_ind_end;   /* and last index of j clusters */
+
+        if (tidxz == 0)
+        {
+            for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i += c_clSize)
+            {
+                /* Pre-load i-atom x and q into shared memory */
+                const int ci = sci * c_nbnxnGpuNumClusterPerSupercluster + tidxj + i;
+                const int ai = ci * c_clSize + tidxi;
+
+                /* We don't need q, but using float4 in shmem avoids bank conflicts.
+                   (but it also wastes L2 bandwidth). */
+                const float4 xq    = a_xq[ai];
+                const float3 shift = a_shiftVec[nbSci.shift];
+                const float4 xi(xq[0] + shift[0], xq[1] + shift[1], xq[2] + shift[2], xq[3]);
+                sm_xq[tidxj + i][tidxi] = xi;
+            }
+        }
+        itemIdx.barrier(fence_space::local_space);
+
+        /* loop over the j clusters = seen by any of the atoms in the current super-cluster.
+         * The loop stride c_syclPruneKernelJ4Concurrency ensures that consecutive warps-pairs are
+         * assigned consecutive j4's entries. */
+        for (int j4 = cij4Start + tidxz; j4 < cij4End; j4 += c_syclPruneKernelJ4Concurrency)
+        {
+            unsigned imaskFull, imaskCheck, imaskNew;
+
+            if constexpr (haveFreshList)
+            {
+                /* Read the mask from the list transferred from the CPU */
+                imaskFull = a_plistCJ4[j4].imei[widx].imask;
+                /* We attempt to prune all pairs present in the original list */
+                imaskCheck = imaskFull;
+                imaskNew   = 0;
+            }
+            else
+            {
+                /* Read the mask from the "warp-pruned" by rlistOuter mask array */
+                imaskFull = a_plistIMask[j4 * c_nbnxnGpuClusterpairSplit + widx];
+                /* Read the old rolling pruned mask, use as a base for new */
+                imaskNew = a_plistCJ4[j4].imei[widx].imask;
+                /* We only need to check pairs with different mask */
+                imaskCheck = (imaskNew ^ imaskFull);
+            }
+
+            if (imaskCheck)
+            {
+                for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
+                {
+                    if (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
+                    {
+                        unsigned mask_ji = (1U << (jm * c_nbnxnGpuNumClusterPerSupercluster));
+                        // SYCL-TODO: Reevaluate prefetching methods
+                        const int cj = a_plistCJ4[j4].cj[jm];
+                        const int aj = cj * c_clSize + tidxj;
+
+                        /* load j atom data */
+                        const float4 tmp = a_xq[aj];
+                        const float3 xj(tmp[0], tmp[1], tmp[2]);
+
+                        for (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+                        {
+                            if (imaskCheck & mask_ji)
+                            {
+                                // load i-cluster coordinates from shmem
+                                const float4 xi = sm_xq[i][tidxi];
+                                // distance between i and j atoms
+                                float3 rv(xi[0], xi[1], xi[2]);
+                                rv -= xj;
+                                const float r2 = norm2(rv);
+
+                                /* If _none_ of the atoms pairs are in rlistOuter
+                                 * range, the bit corresponding to the current
+                                 * cluster-pair in imask gets set to 0. */
+                                if (haveFreshList && !(sycl_2020::group_any_of(sg, r2 < rlistOuterSq)))
+                                {
+                                    imaskFull &= ~mask_ji;
+                                }
+                                /* If any atom pair is within range, set the bit
+                                 * corresponding to the current cluster-pair. */
+                                if (sycl_2020::group_any_of(sg, r2 < rlistInnerSq))
+                                {
+                                    imaskNew |= mask_ji;
+                                }
+                            } // (imaskCheck & mask_ji)
+                            /* shift the mask bit by 1 */
+                            mask_ji += mask_ji;
+                        } // (int i = 0; i < c_nbnxnGpuNumClusterPerSupercluster; i++)
+                    } // (imaskCheck & (superClInteractionMask << (jm * c_nbnxnGpuNumClusterPerSupercluster)))
+                } // for (int jm = 0; jm < c_nbnxnGpuJgroupSize; jm++)
+
+                if constexpr (haveFreshList)
+                {
+                    /* copy the list pruned to rlistOuter to a separate buffer */
+                    a_plistIMask[j4 * c_nbnxnGpuClusterpairSplit + widx] = imaskFull;
+                }
+                /* update the imask with only the pairs up to rlistInner */
+                a_plistCJ4[j4].imei[widx].imask = imaskNew;
+            } // (imaskCheck)
+        } // for (int j4 = cij4_start + tidxz; j4 < cij4_end; j4 += c_syclPruneKernelJ4Concurrency)
+    };
+}
+
+// SYCL 1.2.1 requires providing a unique type for a kernel. Should not be needed for SYCL2020.
+template<bool haveFreshList>
+class NbnxmKernelPruneOnlyName;
+
+template<bool haveFreshList, class... Args>
+cl::sycl::event launchNbnxmKernelPruneOnly(const DeviceStream& deviceStream,
+                                           const int           numSciInPart,
+                                           Args&&... args)
+{
+    // Should not be needed for SYCL2020.
+    using kernelNameType = NbnxmKernelPruneOnlyName<haveFreshList>;
+
+    /* Kernel launch config:
+     * - The thread block dimensions match the size of i-clusters, j-clusters,
+     *   and j-cluster concurrency, in x, y, and z, respectively.
+     * - The 1D block-grid contains as many blocks as super-clusters.
+     */
+    const unsigned long         numBlocks = numSciInPart;
+    const cl::sycl::range<3>    blockSize{ c_clSize, c_clSize, c_syclPruneKernelJ4Concurrency };
+    const cl::sycl::range<3>    globalSize{ numBlocks * blockSize[0], blockSize[1], blockSize[2] };
+    const cl::sycl::nd_range<3> range{ globalSize, blockSize };
+
+    cl::sycl::queue q = deviceStream.stream();
+
+    cl::sycl::event e = q.submit([&](cl::sycl::handler& cgh) {
+        auto kernel = nbnxmKernelPruneOnly<haveFreshList>(cgh, std::forward<Args>(args)...);
+        cgh.parallel_for<kernelNameType>(flattenNDRange(range), kernel);
+    });
+
+    return e;
+}
+
+template<class... Args>
+cl::sycl::event chooseAndLaunchNbnxmKernelPruneOnly(bool haveFreshList, Args&&... args)
+{
+    return gmx::dispatchTemplatedFunction(
+            [&](auto haveFreshList_) {
+                return launchNbnxmKernelPruneOnly<haveFreshList_>(std::forward<Args>(args)...);
+            },
+            haveFreshList);
+}
+
+void launchNbnxmKernelPruneOnly(NbnxmGpu*                 nb,
+                                const InteractionLocality iloc,
+                                const int                 numParts,
+                                const int                 part,
+                                const int                 numSciInPart)
+{
+    sycl_atomdata_t*    adat          = nb->atdat;
+    NBParamGpu*         nbp           = nb->nbparam;
+    gpu_plist*          plist         = nb->plist[iloc];
+    const bool          haveFreshList = plist->haveFreshList;
+    const DeviceStream& deviceStream  = *nb->deviceStreams[iloc];
+
+    cl::sycl::event e = chooseAndLaunchNbnxmKernelPruneOnly(haveFreshList,
+                                                            deviceStream,
+                                                            numSciInPart,
+                                                            adat->xq,
+                                                            adat->shiftVec,
+                                                            plist->cj4,
+                                                            plist->sci,
+                                                            plist->imask,
+                                                            nbp->rlistOuter_sq,
+                                                            nbp->rlistInner_sq,
+                                                            numParts,
+                                                            part);
+}
+
+} // namespace Nbnxm
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h
new file mode 100644
index 0000000000..6af97b1d1e
--- /dev/null
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.h
@@ -0,0 +1,63 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Declares nbnxn sycl helper functions
+ *
+ * \ingroup module_nbnxm
+ */
+#ifndef GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_PRUNEONLY_H
+#define GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_PRUNEONLY_H
+
+// Forward declarations
+namespace gmx
+{
+enum class InteractionLocality;
+}
+struct NbnxmGpu;
+
+namespace Nbnxm
+{
+using gmx::InteractionLocality;
+
+void launchNbnxmKernelPruneOnly(NbnxmGpu*                 nb,
+                                const InteractionLocality iloc,
+                                const int                 numParts,
+                                const int                 part,
+                                const int                 numSciInPart);
+
+} // namespace Nbnxm
+
+#endif // GMX_NBNXM_SYCL_NBNXM_SYCL_KERNEL_PRUNEONLY_H
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
new file mode 100644
index 0000000000..c3aae4ebb5
--- /dev/null
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_utils.h
@@ -0,0 +1,127 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+/*! \internal \file
+ * \brief
+ * Helper functions and constants for SYCL NBNXM kernels
+ *
+ * \ingroup module_nbnxm
+ */
+#ifndef GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
+#define GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
+
+#include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/nbnxm/pairlistparams.h"
+
+namespace Nbnxm
+{
+
+#ifndef GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY
+#    define GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY 4
+#endif
+/*! \brief Macro defining default for the prune kernel's j4 processing concurrency.
+ *
+ *  The GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY macro allows compile-time override.
+ */
+static constexpr int c_syclPruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY;
+
+/* Convenience constants */
+/*! \cond */
+// cluster size = number of atoms per cluster.
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
+// j-cluster size after split (4 in the current implementation).
+static constexpr int c_splitClSize = c_clSize / c_nbnxnGpuClusterpairSplit;
+// i-cluster interaction mask for a super-cluster with all c_nbnxnGpuNumClusterPerSupercluster=8 bits set.
+static constexpr unsigned superClInteractionMask = ((1U << c_nbnxnGpuNumClusterPerSupercluster) - 1U);
+
+// 1/sqrt(pi), same value as \c M_FLOAT_1_SQRTPI in other NB kernels.
+static constexpr float c_OneOverSqrtPi = 0.564189583547756F;
+// 1/6, same value as in other NB kernels.
+static constexpr float c_oneSixth = 0.16666667F;
+// 1/12, same value as in other NB kernels.
+static constexpr float c_oneTwelfth = 0.08333333F;
+/*! \endcond */
+
+/* The following functions are necessary because on some versions of Intel OpenCL RT, subgroups
+ * do not properly work (segfault or create subgroups of size 1) if used in kernels
+ * with non-1-dimensional workgroup. */
+//! \brief Convert 3D range to 1D
+static inline cl::sycl::range<1> flattenRange(cl::sycl::range<3> range3d)
+{
+    return cl::sycl::range<1>(range3d.size());
+}
+
+//! \brief Convert 3D nd_range to 1D
+static inline cl::sycl::nd_range<1> flattenNDRange(cl::sycl::nd_range<3> nd_range3d)
+{
+    return cl::sycl::nd_range<1>(flattenRange(nd_range3d.get_global_range()),
+                                 flattenRange(nd_range3d.get_local_range()));
+}
+
+//! \brief Convert flattened 1D index to 3D
+template<int rangeX, int rangeY>
+static inline cl::sycl::id<3> unflattenId(cl::sycl::id<1> id1d)
+{
+    constexpr unsigned rangeXY = rangeX * rangeY;
+    const unsigned     id      = id1d[0];
+    const unsigned     z       = id / rangeXY;
+    const unsigned     xy      = id % rangeXY;
+    return cl::sycl::id<3>(xy % rangeX, xy / rangeX, z);
+}
+
+//! \brief Convenience wrapper to do atomic addition to a global buffer
+template<cl::sycl::access::mode Mode, class IndexType>
+static inline void atomicFetchAdd(DeviceAccessor<float, Mode> acc, const IndexType idx, const float val)
+{
+    if (cl::sycl::isnormal(val))
+    {
+        sycl_2020::atomic_ref<float, sycl_2020::memory_order::relaxed, sycl_2020::memory_scope::device, cl::sycl::access::address_space::global_space>
+                fout_atomic(acc[idx]);
+        fout_atomic.fetch_add(val);
+    }
+}
+
+static inline float shuffleDown(float var, unsigned int delta, sycl_2020::sub_group sg)
+{
+    return sg.shuffle_down(var, delta);
+}
+
+static inline float shuffleUp(float var, unsigned int delta, sycl_2020::sub_group sg)
+{
+    return sg.shuffle_up(var, delta);
+}
+
+} // namespace Nbnxm
+
+#endif // GMX_NBNXM_SYCL_NBNXN_SYCL_KERNEL_UTILS_H
diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h
new file mode 100644
index 0000000000..edfd2b7473
--- /dev/null
+++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h
@@ -0,0 +1,190 @@
+/*
+ * This file is part of the GROMACS molecular simulation package.
+ *
+ * Copyright (c) 2020,2021, by the GROMACS development team, led by
+ * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
+ * and including many others, as listed in the AUTHORS file in the
+ * top-level source directory and at http://www.gromacs.org.
+ *
+ * GROMACS is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public License
+ * as published by the Free Software Foundation; either version 2.1
+ * of the License, or (at your option) any later version.
+ *
+ * GROMACS is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with GROMACS; if not, see
+ * http://www.gnu.org/licenses, or write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
+ *
+ * If you want to redistribute modifications to GROMACS, please
+ * consider that scientific software is very special. Version
+ * control is crucial - bugs must be traceable. We will be happy to
+ * consider code for inclusion in the official distribution, but
+ * derived work must not be called official GROMACS. Details are found
+ * in the README & COPYING files - if they are missing, get the
+ * official version at http://www.gromacs.org.
+ *
+ * To help us fund GROMACS development, we humbly ask that you cite
+ * the research papers on the package. Check out http://www.gromacs.org.
+ */
+
+/*! \internal \file
+ *  \brief
+ *  Data types used internally in the nbnxm_sycl module.
+ *
+ *  \ingroup module_nbnxm
+ */
+
+#ifndef NBNXM_SYCL_TYPES_H
+#define NBNXM_SYCL_TYPES_H
+
+#include "gromacs/gpu_utils/devicebuffer.h"
+#include "gromacs/gpu_utils/devicebuffer_sycl.h"
+#include "gromacs/gpu_utils/gmxsycl.h"
+#include "gromacs/gpu_utils/gpueventsynchronizer_sycl.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/gpu_utils/syclutils.h"
+#include "gromacs/nbnxm/gpu_types_common.h"
+#include "gromacs/nbnxm/nbnxm.h"
+#include "gromacs/nbnxm/pairlist.h"
+#include "gromacs/timing/gpu_timing.h"
+#include "gromacs/utility/enumerationhelpers.h"
+
+/*! \internal
+ * \brief Staging area for temporary data downloaded from the GPU.
+ *
+ * Since SYCL buffers already have host-side storage, this is a bit redundant.
+ * But it allows prefetching of the data from GPU, and brings GPU backends closer together.
+ */
+struct nb_staging_t
+{
+    //! LJ energy
+    float* e_lj = nullptr;
+    //! electrostatic energy
+    float* e_el = nullptr;
+    //! shift forces
+    float3* fshift = nullptr;
+};
+
+/** \internal
+ * \brief Nonbonded atom data - both inputs and outputs.
+ */
+struct sycl_atomdata_t
+{
+    //! number of atoms
+    int natoms;
+    //! number of local atoms
+    int natoms_local; //
+    //! allocation size for the atom data (xq, f)
+    int numAlloc;
+
+    //! atom coordinates + charges, size \ref natoms
+    DeviceBuffer<float4> xq;
+    //! force output array, size \ref natoms
+    DeviceBuffer<float3> f;
+
+    //! LJ energy output, size 1
+    DeviceBuffer<float> eLJ;
+    //! Electrostatics energy input, size 1
+    DeviceBuffer<float> eElec;
+
+    //! shift forces
+    DeviceBuffer<float3> fShift;
+
+    //! number of atom types
+    int numTypes;
+    //! atom type indices, size \ref natoms
+    DeviceBuffer<int> atomTypes;
+    //! sqrt(c6),sqrt(c12) size \ref natoms
+    DeviceBuffer<float2> ljComb;
+
+    //! shifts
+    DeviceBuffer<float3> shiftVec;
+    //! true if the shift vector has been uploaded
+    bool shiftVecUploaded;
+};
+
+class GpuEventSynchronizer;
+
+/*! \internal
+ * \brief Main data structure for SYCL nonbonded force calculations.
+ */
+struct NbnxmGpu
+{
+    /*! \brief GPU device context.
+     *
+     * \todo Make it constant reference, once NbnxmGpu is a proper class.
+     */
+    const DeviceContext* deviceContext_;
+    /*! \brief true if doing both local/non-local NB work on GPU */
+    bool bUseTwoStreams = false;
+    /*! \brief true indicates that the nonlocal_done event was enqueued */
+    bool bNonLocalStreamActive = false;
+    /*! \brief atom data */
+    sycl_atomdata_t* atdat = nullptr;
+    /*! \brief f buf ops cell index mapping */
+
+    NBParamGpu* nbparam = nullptr;
+    /*! \brief pair-list data structures (local and non-local) */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, Nbnxm::gpu_plist*> plist = { { nullptr } };
+    /*! \brief staging area where fshift/energies get downloaded. Will be removed in SYCL. */
+    nb_staging_t nbst;
+    /*! \brief local and non-local GPU streams */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
+
+    /*! \brief True if event-based timing is enabled. Always false for SYCL. */
+    bool bDoTime = false;
+    /*! \brief Dummy timers. */
+    Nbnxm::gpu_timers_t* timers = nullptr;
+    /*! \brief Dummy timing data. */
+    gmx_wallclock_gpu_nbnxn_t* timings = nullptr;
+
+    //! true when a pair-list transfer has been done at this step
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> didPairlistH2D = { { false } };
+    //! true when we we did pruning on this step
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> didPrune = { { false } };
+    //! true when we did rolling pruning (at the previous step)
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> didRollingPrune = { { false } };
+
+    /*! \brief Events used for synchronization. Would be deprecated in SYCL. */
+    /*! \{ */
+    /*! \brief Event triggered when the non-local non-bonded
+     * kernel is done (and the local transfer can proceed) */
+    GpuEventSynchronizer nonlocal_done;
+    /*! \brief Event triggered when the tasks issued in the local
+     * stream that need to precede the non-local force or buffer
+     * operation calculations are done (e.g. f buffer 0-ing, local
+     * x/q H2D, buffer op initialization in local stream that is
+     * required also by nonlocal stream ) */
+    GpuEventSynchronizer misc_ops_and_local_H2D_done;
+    /*! \} */
+
+    /*! \brief True if there is work for the current domain in the
+     * respective locality.
+     *
+     * This includes local/nonlocal GPU work, either bonded or
+     * nonbonded, scheduled to be executed in the current
+     * domain. As long as bonded work is not split up into
+     * local/nonlocal, if there is bonded GPU work, both flags
+     * will be true. */
+    gmx::EnumerationArray<Nbnxm::InteractionLocality, bool> haveWork = { { false } };
+
+    /*! \brief Pointer to event synchronizer triggered when the local
+     * GPU buffer ops / reduction is complete. Would be deprecated in SYCL.
+     *
+     * \note That the synchronizer is managed outside of this module
+     * in StatePropagatorDataGpu.
+     */
+    GpuEventSynchronizer* localFReductionDone = nullptr;
+
+    /*! \brief Event triggered when non-local coordinate buffer
+     * has been copied from device to host. Would be deprecated in SYCL. */
+    GpuEventSynchronizer* xNonLocalCopyD2HDone = nullptr;
+};
+
+#endif /* NBNXM_SYCL_TYPES_H */
diff --git a/src/testutils/TestMacros.cmake b/src/testutils/TestMacros.cmake
index 01e94e9cd6..bdd991af4b 100644
--- a/src/testutils/TestMacros.cmake
+++ b/src/testutils/TestMacros.cmake
@@ -214,7 +214,7 @@ function (gmx_register_gtest_test NAME EXENAME)
             # Both OpenCL (from JIT) and ThreadSanitizer (from how it
             # checks) can take signficantly more time than other
             # configurations.
-            if (GMX_GPU_OPENCL)
+            if (GMX_GPU_OPENCL OR GMX_GPU_SYCL)
                 set(_timeout 240)
             elseif (${CMAKE_BUILD_TYPE} STREQUAL TSAN)
                 set(_timeout 300)