From: Aleksei Iupinov Date: Mon, 19 Feb 2018 18:25:34 +0000 (+0100) Subject: prepareGpuKernelArguments() and launchGpuKernel() are added X-Git-Url: http://biod.pnpi.spb.ru/gitweb/?a=commitdiff_plain;h=58ec6661f1e59ce55c0e525400a0aa2aa8888106;p=alexxy%2Fgromacs.git prepareGpuKernelArguments() and launchGpuKernel() are added Template functions are added for preparing/packing GPU kernel arguments and launching GPU kernels. This allows hiding the CUDA/OpenCL boilerplate. launchGpuKernel() throws InternalError on failure. GPU kernel scheduling details are unified into a KernelLaunchConfig struct. The new functions are applied in the NB kernels as well as in PME (facilitating the PME OpenCL port). Change-Id: I90205cf1cda93c377a342b7f1c46a8caf239ba65 --- diff --git a/src/gromacs/ewald/pme-gather.cu b/src/gromacs/ewald/pme-gather.cu index 7dc096fc24..71d1687926 100644 --- a/src/gromacs/ewald/pme-gather.cu +++ b/src/gromacs/ewald/pme-gather.cu @@ -428,7 +428,6 @@ void pme_gpu_gather(PmeGpu *pmeGpu, pme_gpu_copy_input_forces(pmeGpu); } - cudaStream_t stream = pmeGpu->archSpecific->pmeStream; const int order = pmeGpu->common->pme_order; const auto *kernelParamsPtr = pmeGpu->kernelParams.get(); @@ -445,35 +444,38 @@ void pme_gpu_gather(PmeGpu *pmeGpu, const int atomsPerBlock = (c_gatherMaxThreadsPerBlock / PME_SPREADGATHER_THREADS_PER_ATOM); GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. gathering block size"); - const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; - auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); - dim3 dimBlock(order, order, atomsPerBlock); + const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; + auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); - const bool wrapX = true; - const bool wrapY = true; + KernelLaunchConfig config; + config.blockSize[0] = config.blockSize[1] = order; + config.blockSize[2] = atomsPerBlock; + config.gridSize[0] = dimGrid.x; + config.gridSize[1] = dimGrid.y; + config.stream = pmeGpu->archSpecific->pmeStream; + + if (order != 4) + { + GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!")); + } + + constexpr bool wrapX = true; + constexpr bool wrapY = true; GMX_UNUSED_VALUE(wrapX); GMX_UNUSED_VALUE(wrapY); // TODO test different cache configs - pme_gpu_start_timing(pmeGpu, gtPME_GATHER); - if (order == 4) - { - if (forceTreatment == PmeForceOutputHandling::Set) - { - pme_gather_kernel<4, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - } - else - { - pme_gather_kernel<4, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - } - } - else - { - GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 is not implemented")); - } - CU_LAUNCH_ERR("pme_gather_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_GATHER); + int timingId = gtPME_GATHER; + void (*kernelPtr)(const PmeGpuCudaKernelParams) = (forceTreatment == PmeForceOutputHandling::Set) ?
+ pme_gather_kernel<4, true, wrapX, wrapY> : + pme_gather_kernel<4, false, wrapX, wrapY>; + + pme_gpu_start_timing(pmeGpu, timingId); + auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); + launchGpuKernel(kernelPtr, config, timingEvent, "PME gather", kernelArgs); + pme_gpu_stop_timing(pmeGpu, timingId); pme_gpu_copy_output_forces(pmeGpu); } diff --git a/src/gromacs/ewald/pme-solve.cu b/src/gromacs/ewald/pme-solve.cu index f0f55df576..6a035dd410 100644 --- a/src/gromacs/ewald/pme-solve.cu +++ b/src/gromacs/ewald/pme-solve.cu @@ -445,37 +445,36 @@ void pme_gpu_solve(const PmeGpu *pmeGpu, t_complex *h_grid, const int blocksPerGridLine = (gridLineSize + maxBlockSize - 1) / maxBlockSize; const int cellsPerBlock = gridLineSize * gridLinesPerBlock; const int blockSize = (cellsPerBlock + warp_size - 1) / warp_size * warp_size; + + + KernelLaunchConfig config; + config.blockSize[0] = blockSize; + config.gridSize[0] = blocksPerGridLine; // rounding up to full warps so that shuffle operations produce defined results - dim3 threads(blockSize); - dim3 blocks(blocksPerGridLine, - (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1) / gridLinesPerBlock, - pmeGpu->kernelParams->grid.complexGridSize[majorDim]); + config.gridSize[1] = (pmeGpu->kernelParams->grid.complexGridSize[middleDim] + gridLinesPerBlock - 1) / gridLinesPerBlock; + config.gridSize[2] = pmeGpu->kernelParams->grid.complexGridSize[majorDim]; + config.stream = pmeGpu->archSpecific->pmeStream; - pme_gpu_start_timing(pmeGpu, gtPME_SOLVE); + int timingId = gtPME_SOLVE; + void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr; if (gridOrdering == GridOrdering::YZX) { - if (computeEnergyAndVirial) - { - pme_solve_kernel<GridOrdering::YZX, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr); - } - else - { - pme_solve_kernel<GridOrdering::YZX, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr); - } + kernelPtr = computeEnergyAndVirial ? + pme_solve_kernel<GridOrdering::YZX, true> : + pme_solve_kernel<GridOrdering::YZX, false>; } else if (gridOrdering == GridOrdering::XYZ) { - if (computeEnergyAndVirial) - { - pme_solve_kernel<GridOrdering::XYZ, true> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr); - } - else - { - pme_solve_kernel<GridOrdering::XYZ, false> <<< blocks, threads, 0, stream>>> (*kernelParamsPtr); - } + kernelPtr = computeEnergyAndVirial ? + pme_solve_kernel<GridOrdering::XYZ, true> : + pme_solve_kernel<GridOrdering::XYZ, false>; } - CU_LAUNCH_ERR("pme_solve_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_SOLVE); + + pme_gpu_start_timing(pmeGpu, timingId); + auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); + launchGpuKernel(kernelPtr, config, timingEvent, "PME solve", kernelArgs); + pme_gpu_stop_timing(pmeGpu, timingId); if (computeEnergyAndVirial) { diff --git a/src/gromacs/ewald/pme-spread.cu b/src/gromacs/ewald/pme-spread.cu index 6574894146..89df2c77f9 100644 --- a/src/gromacs/ewald/pme-spread.cu +++ b/src/gromacs/ewald/pme-spread.cu @@ -489,7 +489,6 @@ void pme_gpu_spread(const PmeGpu *pmeGpu, bool spreadCharges) { GMX_ASSERT(computeSplines || spreadCharges, "PME spline/spread kernel has invalid input (nothing to do)"); - cudaStream_t stream = pmeGpu->archSpecific->pmeStream; const auto *kernelParamsPtr = pmeGpu->kernelParams.get(); GMX_ASSERT(kernelParamsPtr->atoms.nAtoms > 0, "No atom data in PME GPU spread"); @@ -503,50 +502,53 @@ void pme_gpu_spread(const PmeGpu *pmeGpu, //(for spline data mostly, together with varying PME_GPU_PARALLEL_SPLINE define) GMX_ASSERT(!c_usePadding || !(PME_ATOM_DATA_ALIGNMENT % atomsPerBlock), "inconsistent atom data padding vs. spreading block size"); - const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; - auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); - dim3 dimBlock(order, order, atomsPerBlock); + const int blockCount = pmeGpu->nAtomsPadded / atomsPerBlock; + auto dimGrid = pmeGpuCreateGrid(pmeGpu, blockCount); + + KernelLaunchConfig config; + config.blockSize[0] = config.blockSize[1] = order; + config.blockSize[2] = atomsPerBlock; + config.gridSize[0] = dimGrid.x; + config.gridSize[1] = dimGrid.y; + config.stream = pmeGpu->archSpecific->pmeStream; + + if (order != 4) + { + GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not implemented!")); + } // These should later check for PME decomposition - const bool wrapX = true; - const bool wrapY = true; + constexpr bool wrapX = true; + constexpr bool wrapY = true; GMX_UNUSED_VALUE(wrapX); GMX_UNUSED_VALUE(wrapY); - switch (order) + + int timingId; + void (*kernelPtr)(const PmeGpuCudaKernelParams) = nullptr; + if (computeSplines) { - case 4: + if (spreadCharges) { - // TODO: cleaner unroll with some template trick?
- if (computeSplines) - { - if (spreadCharges) - { - pme_gpu_start_timing(pmeGpu, gtPME_SPLINEANDSPREAD); - pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - CU_LAUNCH_ERR("pme_spline_and_spread_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_SPLINEANDSPREAD); - } - else - { - pme_gpu_start_timing(pmeGpu, gtPME_SPLINE); - pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - CU_LAUNCH_ERR("pme_spline_and_spread_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_SPLINE); - } - } - else - { - pme_gpu_start_timing(pmeGpu, gtPME_SPREAD); - pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY> <<< dimGrid, dimBlock, 0, stream>>> (*kernelParamsPtr); - CU_LAUNCH_ERR("pme_spline_and_spread_kernel"); - pme_gpu_stop_timing(pmeGpu, gtPME_SPREAD); - } + timingId = gtPME_SPLINEANDSPREAD; + kernelPtr = pme_spline_and_spread_kernel<4, true, true, wrapX, wrapY>; + } + else + { + timingId = gtPME_SPLINE; + kernelPtr = pme_spline_and_spread_kernel<4, true, false, wrapX, wrapY>; } - break; - - default: - GMX_THROW(gmx::NotImplementedError("The code for pme_order != 4 was not tested!")); } + else + { + timingId = gtPME_SPREAD; + kernelPtr = pme_spline_and_spread_kernel<4, false, true, wrapX, wrapY>; + } + + pme_gpu_start_timing(pmeGpu, timingId); + auto *timingEvent = pme_gpu_fetch_timing_event(pmeGpu, timingId); + const auto kernelArgs = prepareGpuKernelArguments(kernelPtr, config, kernelParamsPtr); + launchGpuKernel(kernelPtr, config, timingEvent, "PME spline/spread", kernelArgs); + pme_gpu_stop_timing(pmeGpu, timingId); const bool copyBackGrid = spreadCharges && (pme_gpu_is_testing(pmeGpu) || !pme_gpu_performs_FFT(pmeGpu)); if (copyBackGrid) diff --git a/src/gromacs/ewald/pme-timings.cu b/src/gromacs/ewald/pme-timings.cu index f806e803a7..306ad2392b 100644 --- a/src/gromacs/ewald/pme-timings.cu +++ b/src/gromacs/ewald/pme-timings.cu @@ -67,6 +67,17 @@ void pme_gpu_start_timing(const PmeGpu *pmeGpu, size_t PMEStageId) } } +CommandEvent *pme_gpu_fetch_timing_event(const PmeGpu *pmeGpu, size_t PMEStageId) +{ + CommandEvent *timingEvent = nullptr; + if (pme_gpu_timings_enabled(pmeGpu)) + { + GMX_ASSERT(PMEStageId < pmeGpu->archSpecific->timingEvents.size(), "Wrong PME GPU timing event index"); + timingEvent = pmeGpu->archSpecific->timingEvents[PMEStageId].fetchNextEvent(); + } + return timingEvent; +} + void pme_gpu_stop_timing(const PmeGpu *pmeGpu, size_t PMEStageId) { if (pme_gpu_timings_enabled(pmeGpu)) diff --git a/src/gromacs/ewald/pme-timings.cuh b/src/gromacs/ewald/pme-timings.cuh index 61dcba58d4..4df40b17e5 100644 --- a/src/gromacs/ewald/pme-timings.cuh +++ b/src/gromacs/ewald/pme-timings.cuh @@ -1,7 +1,7 @@ /* * This file is part of the GROMACS molecular simulation package. * - * Copyright (c) 2016,2017, by the GROMACS development team, led by + * Copyright (c) 2016,2017,2018, by the GROMACS development team, led by * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl, * and including many others, as listed in the AUTHORS file in the * top-level source directory and at http://www.gromacs.org. @@ -55,6 +55,15 @@ struct PmeGpu; */ void pme_gpu_start_timing(const PmeGpu *pmeGpu, size_t PMEStageId); +/*! \libinternal \brief + * Returns raw timing event from the corresponding GpuRegionTimer (if timings are enabled). + * In CUDA result can be nullptr stub, per GpuRegionTimer implementation. + * + * \param[in] pmeGpu The PME GPU data structure. 
+ * \param[in] PMEStageId The PME GPU stage gtPME_ index from the enum in src/gromacs/timing/gpu_timing.h + */ +CommandEvent *pme_gpu_fetch_timing_event(const PmeGpu *pmeGpu, size_t PMEStageId); + /*! \libinternal \brief * Stops timing the certain PME GPU stage during a single computation (if timings are enabled). * diff --git a/src/gromacs/gpu_utils/cudautils.cuh b/src/gromacs/gpu_utils/cudautils.cuh index 945d7bb1d7..6eabc3ce14 100644 --- a/src/gromacs/gpu_utils/cudautils.cuh +++ b/src/gromacs/gpu_utils/cudautils.cuh @@ -42,10 +42,13 @@ #include <nvml.h> #endif /* HAVE_NVML */ +#include <array> #include <string> +#include "gromacs/gpu_utils/gputraits.cuh" #include "gromacs/math/vec.h" #include "gromacs/math/vectypes.h" +#include "gromacs/utility/exceptions.h" #include "gromacs/utility/fatalerror.h" #include "gromacs/utility/gmxassert.h" #include "gromacs/utility/stringutil.h" @@ -123,32 +126,10 @@ enum class GpuApiCallBehavior; } \ } while (0) -/*! Check for any previously occurred uncaught CUDA error -- aimed at use after kernel calls. */ -#define CU_LAUNCH_ERR(msg) \ - do { \ - cudaError_t _CU_LAUNCH_ERR_status = cudaGetLastError(); \ - if (_CU_LAUNCH_ERR_status != cudaSuccess) { \ - gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_LAUNCH_ERR_status)); \ - } \ - } while (0) - -/*! Synchronize with GPU and check for any previously occurred uncaught CUDA error -- aimed at use after kernel calls. */ -#define CU_LAUNCH_ERR_SYNC(msg) \ - do { \ - cudaError_t _CU_SYNC_LAUNCH_ERR_status = cudaThreadSynchronize(); \ - if (_CU_SYNC_LAUNCH_ERR_status != cudaSuccess) { \ - gmx_fatal(FARGS, "Error while launching kernel %s: %s\n", msg, cudaGetErrorString(_CU_SYNC_LAUNCH_ERR_status)); \ - } \ - } while (0) - #else /* CHECK_CUDA_ERRORS */ #define CU_RET_ERR(status, msg) do { } while (0) #define CU_CHECK_PREV_ERR() do { } while (0) -#define CU_LAUNCH_ERR(msg) do { } while (0) -#define CU_LAUNCH_ERR_SYNC(msg) do { } while (0) #define HANDLE_NVML_RET_ERR(status, msg) do { } while (0) #endif /* CHECK_CUDA_ERRORS */ @@ -286,4 +267,97 @@ static inline bool haveStreamTasksCompleted(cudaStream_t s) return true; } +/* Kernel launch helpers */ + +/*! \brief + * A function for setting up a single CUDA kernel argument. + * This is the tail of the compile-time recursive function below. + * It has to be seen by the compiler first. + * + * \tparam totalArgsCount Number of the kernel arguments + * \tparam KernelPtr Kernel function handle type + * \param[in] argIndex Index of the current argument + */ +template <size_t totalArgsCount, typename KernelPtr> +void prepareGpuKernelArgument(KernelPtr /*kernel*/, + std::array<void *, totalArgsCount> */* kernelArgsPtr */, + size_t gmx_used_in_debug argIndex) +{ + GMX_ASSERT(argIndex == totalArgsCount, "Tail expansion"); +} + +/*! \brief + * Compile-time recursive function for setting up a single CUDA kernel argument. + * This function copies a kernel argument pointer \p argPtr into \p kernelArgsPtr, + * and calls itself on the next argument, eventually calling the tail function above.
+ * + * \tparam CurrentArg Type of the current argument + * \tparam RemainingArgs Types of remaining arguments after the current one + * \tparam totalArgsCount Number of the kernel arguments + * \tparam KernelPtr Kernel function handle type + * \param[in] kernel Kernel function handle + * \param[in,out] kernelArgsPtr Pointer to the argument array to be filled in + * \param[in] argIndex Index of the current argument + * \param[in] argPtr Pointer to the current argument + * \param[in] otherArgsPtrs Pack of pointers to arguments remaining to process after the current one + */ +template <typename CurrentArg, typename ... RemainingArgs, size_t totalArgsCount, typename KernelPtr> +void prepareGpuKernelArgument(KernelPtr kernel, + std::array<void *, totalArgsCount> *kernelArgsPtr, + size_t argIndex, + const CurrentArg *argPtr, + const RemainingArgs *... otherArgsPtrs) +{ + (*kernelArgsPtr)[argIndex] = (void *)argPtr; + prepareGpuKernelArgument(kernel, kernelArgsPtr, argIndex + 1, otherArgsPtrs ...); +} + +/*! \brief + * A wrapper function for setting up all the CUDA kernel arguments. + * Calls the recursive functions above. + * + * \tparam Args Types of all the kernel arguments + * \param[in] kernel Kernel function handle + * \param[in] argsPtrs Pointers to all the kernel arguments + * \returns A prepared parameter pack to be used with launchGpuKernel() as the last argument. + */ +template <typename ... Args> +std::array<void *, sizeof ... (Args)> prepareGpuKernelArguments(void (*kernel)(Args...), + const KernelLaunchConfig & /*config */, + const Args *... argsPtrs) +{ + std::array<void *, sizeof ... (Args)> kernelArgs; + prepareGpuKernelArgument(kernel, &kernelArgs, 0, argsPtrs ...); + return kernelArgs; +} + +/*! \brief Launches the CUDA kernel and handles the errors. + * + * \tparam Args Types of all the kernel arguments + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] kernelName Human readable kernel description, for error handling only + * \param[in] kernelArgs Array of the pointers to the kernel arguments, prepared by prepareGpuKernelArguments() + * \throws gmx::InternalError on kernel launch failure + */ +template <typename ... Args> +void launchGpuKernel(void (*kernel)(Args...), + const KernelLaunchConfig &config, + CommandEvent */*timingEvent */, + const char *kernelName, + const std::array<void *, sizeof ... (Args)> &kernelArgs) +{ + dim3 blockSize(config.blockSize[0], config.blockSize[1], config.blockSize[2]); + dim3 gridSize(config.gridSize[0], config.gridSize[1], config.gridSize[2]); + cudaLaunchKernel((void *)kernel, gridSize, blockSize, const_cast<void **>(kernelArgs.data()), config.sharedMemorySize, config.stream); + + cudaError_t status = cudaGetLastError(); + if (cudaSuccess != status) + { + const std::string errorMessage = "GPU kernel (" + std::string(kernelName) + + ") failed to launch: " + std::string(cudaGetErrorString(status)); + GMX_THROW(gmx::InternalError(errorMessage)); + } +} + #endif diff --git a/src/gromacs/gpu_utils/gpu_utils.cu b/src/gromacs/gpu_utils/gpu_utils.cu index c5e367884d..4ed5cbefc1 100644 --- a/src/gromacs/gpu_utils/gpu_utils.cu +++ b/src/gromacs/gpu_utils/gpu_utils.cu @@ -254,7 +254,10 @@ static int do_sanity_checks(int dev_id, cudaDeviceProp *dev_prop) } /* try to execute a dummy kernel */ - k_dummy_test<<< 1, 512>>> (); + KernelLaunchConfig config; + config.blockSize[0] = 512; + const auto dummyArguments = prepareGpuKernelArguments(k_dummy_test, config); + launchGpuKernel(k_dummy_test, config, nullptr, "Dummy kernel", dummyArguments); if (cudaThreadSynchronize() != cudaSuccess) { return -1; diff --git a/src/gromacs/gpu_utils/gputraits.cuh b/src/gromacs/gpu_utils/gputraits.cuh index 323f80eb9a..a4406e1943 100644 ---
a/src/gromacs/gpu_utils/gputraits.cuh +++ b/src/gromacs/gpu_utils/gputraits.cuh @@ -49,4 +49,17 @@ using CommandEvent = void; //! \brief Context used explicitly in OpenCL, does nothing in CUDA using Context = void *; +/*! \internal \brief + * GPU kernels scheduling description. This is same in OpenCL/CUDA. + * Provides reasonable defaults, one typically only needs to set the GPU stream + * and non-1 work sizes. + */ +struct KernelLaunchConfig +{ + size_t gridSize[3] = {1, 1, 1}; //!< Block counts + size_t blockSize[3] = {1, 1, 1}; //!< Per-block thread counts + size_t sharedMemorySize = 0; //!< Shared memory size in bytes + CommandStream stream = nullptr; //!< Stream to launch kernel in +}; + #endif diff --git a/src/gromacs/gpu_utils/gputraits_ocl.h b/src/gromacs/gpu_utils/gputraits_ocl.h index adf6f9508f..0a2301e38f 100644 --- a/src/gromacs/gpu_utils/gputraits_ocl.h +++ b/src/gromacs/gpu_utils/gputraits_ocl.h @@ -51,4 +51,17 @@ using CommandEvent = cl_event; //! \brief Context used explicitly in OpenCL using Context = cl_context; +/*! \internal \brief + * GPU kernels scheduling description. This is same in OpenCL/CUDA. + * Provides reasonable defaults, one typically only needs to set the GPU stream + * and non-1 work sizes. + */ +struct KernelLaunchConfig +{ + size_t gridSize[3] = {1, 1, 1}; //!< Work groups (CUDA blocks) counts + size_t blockSize[3] = {1, 1, 1}; //!< Per work group (CUDA block) thread counts + size_t sharedMemorySize = 0; //!< Shared memory size in bytes + CommandStream stream = nullptr; //!< Stream to launch kernel in +}; + #endif diff --git a/src/gromacs/gpu_utils/oclutils.h b/src/gromacs/gpu_utils/oclutils.h index afe3238875..ef7de9aa54 100644 --- a/src/gromacs/gpu_utils/oclutils.h +++ b/src/gromacs/gpu_utils/oclutils.h @@ -44,6 +44,8 @@ #include <string> #include "gromacs/gpu_utils/gmxopencl.h" +#include "gromacs/gpu_utils/gputraits_ocl.h" +#include "gromacs/utility/exceptions.h" #include "gromacs/utility/gmxassert.h" enum class GpuApiCallBehavior; @@ -177,4 +179,107 @@ static inline bool haveStreamTasksCompleted(cl_command_queue gmx_unused s) return false; } +/* Kernel launch helpers */ + +/*! \brief + * A function for setting up a single OpenCL kernel argument. + * This is the tail of the compile-time recursive function below. + * It has to be seen by the compiler first. + * As NB kernels might be using dynamic local memory as the last argument, + * this function also manages that, using sharedMemorySize from \p config. + * + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] argIndex Index of the current argument + */ +void inline prepareGpuKernelArgument(cl_kernel kernel, + const KernelLaunchConfig &config, + size_t argIndex) +{ + if (config.sharedMemorySize > 0) + { + cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, config.sharedMemorySize, nullptr); + GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str()); + } +} + +/*! \brief + * Compile-time recursive function for setting up a single OpenCL kernel argument. + * This function uses one kernel argument pointer \p argPtr to call clSetKernelArg(), + * and calls itself on the next argument, eventually calling the tail function above.
+ * + * \tparam CurrentArg Type of the current argument + * \tparam RemainingArgs Types of remaining arguments after the current one + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] argIndex Index of the current argument + * \param[in] argPtr Pointer to the current argument + * \param[in] otherArgsPtrs Pack of pointers to arguments remaining to process after the current one + */ +template <typename CurrentArg, typename ... RemainingArgs> +void prepareGpuKernelArgument(cl_kernel kernel, + const KernelLaunchConfig &config, + size_t argIndex, + const CurrentArg *argPtr, + const RemainingArgs *... otherArgsPtrs) +{ + cl_int gmx_used_in_debug clError = clSetKernelArg(kernel, argIndex, sizeof(CurrentArg), argPtr); + GMX_ASSERT(CL_SUCCESS == clError, ocl_get_error_string(clError).c_str()); + + prepareGpuKernelArgument(kernel, config, argIndex + 1, otherArgsPtrs ...); +} + +/*! \brief + * A wrapper function for setting up all the OpenCL kernel arguments. + * Calls the recursive functions above. + * + * \tparam Args Types of all the kernel arguments + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] argsPtrs Pointers to all the kernel arguments + * \returns A handle for the prepared parameter pack to be used with launchGpuKernel() as the last argument + * - currently always nullptr for OpenCL, as it manages kernel/arguments association by itself. + */ +template <typename ... Args> +void *prepareGpuKernelArguments(cl_kernel kernel, + const KernelLaunchConfig &config, + const Args *... argsPtrs) +{ + prepareGpuKernelArgument(kernel, config, 0, argsPtrs ...); + return nullptr; +} + +/*! \brief Launches the OpenCL kernel and handles the errors. + * + * \param[in] kernel Kernel function handle + * \param[in] config Kernel configuration for launching + * \param[in] timingEvent Timing event, fetched from GpuRegionTimer + * \param[in] kernelName Human readable kernel description, for error handling only + * \throws gmx::InternalError on kernel launch failure + */ +inline void launchGpuKernel(cl_kernel kernel, + const KernelLaunchConfig &config, + CommandEvent *timingEvent, + const char *kernelName, + const void * /*kernelArgs*/) +{ + const int workDimensions = 3; + const size_t *globalWorkOffset = nullptr; + const size_t waitListSize = 0; + const cl_event *waitList = nullptr; + size_t globalWorkSize[3]; + for (int i = 0; i < workDimensions; i++) + { + globalWorkSize[i] = config.gridSize[i] * config.blockSize[i]; + } + cl_int clError = clEnqueueNDRangeKernel(config.stream, kernel, workDimensions, globalWorkOffset, + globalWorkSize, config.blockSize, waitListSize, waitList, timingEvent); + if (CL_SUCCESS != clError) + { + const std::string errorMessage = "GPU kernel (" + std::string(kernelName) + + ") failed to launch: " + ocl_get_error_string(clError); + GMX_THROW(gmx::InternalError(errorMessage)); + } +} + #endif diff --git a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu index e18c197235..ecfccb8621 100644 --- a/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu +++ b/src/gromacs/mdlib/nbnxn_cuda/nbnxn_cuda.cu @@ -299,7 +299,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, cudaError_t stat; int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* CUDA kernel launch-related stuff */ - int shmem, nblock; + int nblock; dim3 dim_block, dim_grid; nbnxn_cu_kfunc_ptr_t nb_kernel = NULL; /* fn pointer to the nonbonded kernel */ @@ -412,29 +412,29 @@ void
nbnxn_gpu_launch_kernel(gmx_nbnxn_cuda_t *nb, num_threads_z = 2; } nblock = calc_nb_kernel_nblock(plist->nsci, nb->dev_info); - dim_block = dim3(c_clSize, c_clSize, num_threads_z); - dim_grid = dim3(nblock, 1, 1); - shmem = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp); + + KernelLaunchConfig config; + config.blockSize[0] = c_clSize; + config.blockSize[1] = c_clSize; + config.blockSize[2] = num_threads_z; + config.gridSize[0] = nblock; + config.sharedMemorySize = calc_shmem_required_nonbonded(num_threads_z, nb->dev_info, nbp); + config.stream = stream; if (debug) { - fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %ux%ux%u\n\t" - "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n" - "\tShMem: %d\n", - dim_block.x, dim_block.y, dim_block.z, - dim_grid.x, dim_grid.y, plist->nsci*c_numClPerSupercl, + fprintf(debug, "Non-bonded GPU launch configuration:\n\tThread block: %zux%zux%zu\n\t" + "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n" + "\tShMem: %zu\n", + config.blockSize[0], config.blockSize[1], config.blockSize[2], + config.gridSize[0], config.gridSize[1], plist->nsci*c_numClPerSupercl, c_numClPerSupercl, plist->na_c, - shmem); + config.sharedMemorySize); } - void* kernel_args[4]; - kernel_args[0] = adat; - kernel_args[1] = nbp; - kernel_args[2] = plist; - kernel_args[3] = &bCalcFshift; - - cudaLaunchKernel((void *)nb_kernel, dim_grid, dim_block, kernel_args, shmem, stream); - CU_LAUNCH_ERR("k_calc_nb"); + auto *timingEvent = bDoTime ? t->nb_k[iloc].fetchNextEvent() : nullptr; + const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, adat, nbp, plist, &bCalcFshift); + launchGpuKernel(nb_kernel, config, timingEvent, "k_calc_nb", kernelArgs); if (bDoTime) { @@ -531,39 +531,32 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_cuda_t *nb, * and j-cluster concurrency, in x, y, and z, respectively. * - The 1D block-grid contains as many blocks as super-clusters. 
*/ - int num_threads_z = c_cudaPruneKernelJ4Concurrency; - int nblock = calc_nb_kernel_nblock(numSciInPart, nb->dev_info); - dim3 dim_block = dim3(c_clSize, c_clSize, num_threads_z); - dim3 dim_grid = dim3(nblock, 1, 1); - int shmem = calc_shmem_required_prune(num_threads_z); + int num_threads_z = c_cudaPruneKernelJ4Concurrency; + int nblock = calc_nb_kernel_nblock(numSciInPart, nb->dev_info); + KernelLaunchConfig config; + config.blockSize[0] = c_clSize; + config.blockSize[1] = c_clSize; + config.blockSize[2] = num_threads_z; + config.gridSize[0] = nblock; + config.sharedMemorySize = calc_shmem_required_prune(num_threads_z); + config.stream = stream; if (debug) { - fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %ux%ux%u\n\t" - "\tGrid: %ux%u\n\t#Super-clusters/clusters: %d/%d (%d)\n" - "\tShMem: %d\n", - dim_block.x, dim_block.y, dim_block.z, - dim_grid.x, dim_grid.y, numSciInPart*c_numClPerSupercl, + fprintf(debug, "Pruning GPU kernel launch configuration:\n\tThread block: %zux%zux%zu\n\t" + "\tGrid: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n" + "\tShMem: %zu\n", + config.blockSize[0], config.blockSize[1], config.blockSize[2], + config.gridSize[0], config.gridSize[1], numSciInPart*c_numClPerSupercl, c_numClPerSupercl, plist->na_c, - shmem); + config.sharedMemorySize); } - void* kernel_args[5]; - kernel_args[0] = adat; - kernel_args[1] = nbp; - kernel_args[2] = plist; - kernel_args[3] = &numParts; - kernel_args[4] = &part; - - if (plist->haveFreshList) - { - cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<true>, dim_grid, dim_block, kernel_args, shmem, stream); - } - else - { - cudaLaunchKernel((void *)nbnxn_kernel_prune_cuda<false>, dim_grid, dim_block, kernel_args, shmem, stream); - } - CU_LAUNCH_ERR("k_pruneonly"); + auto *timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr; + constexpr char kernelName[] = "k_pruneonly"; + const auto &kernel = plist->haveFreshList ? nbnxn_kernel_prune_cuda<true> : nbnxn_kernel_prune_cuda<false>; + const auto kernelArgs = prepareGpuKernelArguments(kernel, config, adat, nbp, plist, &numParts, &part); + launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs); /* TODO: consider a more elegant way to track which kernel has been called (combined or separate 1st pass prune, rolling prune). */ diff --git a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp index 8f80c77526..af56d62424 100644 --- a/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp +++ b/src/gromacs/mdlib/nbnxn_ocl/nbnxn_ocl.cpp @@ -99,13 +99,20 @@ static const int c_clSize = c_nbnxnGpuClusterSize; /*! \brief Validates the input global work size parameter. */ -static inline void validate_global_work_size(size_t *global_work_size, int work_dim, const gmx_device_info_t *dinfo) +static inline void validate_global_work_size(const KernelLaunchConfig &config, int work_dim, const gmx_device_info_t *dinfo) { cl_uint device_size_t_size_bits; cl_uint host_size_t_size_bits; assert(dinfo); + size_t global_work_size[3]; + GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet"); + for (int i = 0; i < work_dim; i++) + { + global_work_size[i] = config.blockSize[i] * config.gridSize[i]; + } + /* Each component of a global_work_size must not exceed the range given by the sizeof(device size_t) for the device on which the kernel execution will be enqueued.
See: @@ -376,12 +383,9 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, int flags, int iloc) { - cl_int cl_error; int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ /* OpenCL kernel launch-related stuff */ - int shmem; - size_t local_work_size[3], global_work_size[3]; - cl_kernel nb_kernel = NULL; /* fn pointer to the nonbonded kernel */ + cl_kernel nb_kernel = NULL; /* fn pointer to the nonbonded kernel */ cl_atomdata_t *adat = nb->atdat; cl_nbparam_t *nbp = nb->nbparam; @@ -392,7 +396,6 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, bool bCalcEner = flags & GMX_FORCE_ENERGY; int bCalcFshift = flags & GMX_FORCE_VIRIAL; bool bDoTime = nb->bDoTime; - cl_uint arg_no; cl_nbparam_params_t nbparams_params; @@ -446,7 +449,7 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, { if (iloc == eintLocal) { - cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done)); + cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, NULL, &(nb->misc_ops_and_local_H2D_done)); assert(CL_SUCCESS == cl_error); /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed @@ -493,67 +496,49 @@ void nbnxn_gpu_launch_kernel(gmx_nbnxn_ocl_t *nb, (plist->haveFreshList && !nb->timers->didPrune[iloc])); /* kernel launch config */ - local_work_size[0] = c_clSize; - local_work_size[1] = c_clSize; - local_work_size[2] = 1; - - global_work_size[0] = plist->nsci * local_work_size[0]; - global_work_size[1] = 1 * local_work_size[1]; - global_work_size[2] = 1 * local_work_size[2]; - validate_global_work_size(global_work_size, 3, nb->dev_info); + KernelLaunchConfig config; + config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam); + config.stream = stream; + config.blockSize[0] = c_clSize; + config.blockSize[1] = c_clSize; + config.gridSize[0] = plist->nsci; - shmem = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam); + validate_global_work_size(config, 3, nb->dev_info); if (debug) { fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %dx%dx%d\n\t" "Global work size : %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n", - (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]), - (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl, + (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]), + (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl, c_numClPerSupercl, plist->na_c); } fillin_ocl_structures(nbp, &nbparams_params); - arg_no = 0; - cl_error = CL_SUCCESS; - if (!useLjCombRule(nb->nbparam->vdwtype)) - { - cl_error = clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &(adat->ntypes)); - } - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(nbparams_params), &(nbparams_params)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->xq)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->f)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_lj)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->e_el)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->fshift)); + auto *timingEvent = bDoTime ? 
t->nb_k[iloc].fetchNextEvent() : nullptr; + constexpr char kernelName[] = "k_calc_nb"; if (useLjCombRule(nb->nbparam->vdwtype)) { - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->lj_comb)); + const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, + &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift, + &adat->lj_comb, + &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, + &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift); + + launchGpuKernel(nb_kernel, config, timingEvent, kernelName, kernelArgs); } else { - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->atom_types)); + const auto kernelArgs = prepareGpuKernelArguments(nb_kernel, config, + &adat->ntypes, + &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift, + &adat->atom_types, + &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, + &plist->sci, &plist->cj4, &plist->excl, &bCalcFshift); + launchGpuKernel(nb_kernel, config, timingEvent, kernelName, kernelArgs); } - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_climg2d)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->nbfp_comb_climg2d)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(nbp->coulomb_tab_climg2d)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->sci)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->cj4)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(cl_mem), &(plist->excl)); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, sizeof(int), &bCalcFshift); - cl_error |= clSetKernelArg(nb_kernel, arg_no++, shmem, NULL); - - assert(cl_error == CL_SUCCESS); - - if (cl_error) - { - printf("OpenCL error: %s\n", ocl_get_error_string(cl_error).c_str()); - } - cl_error = clEnqueueNDRangeKernel(stream, nb_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, bDoTime ? 
t->nb_k[iloc].fetchNextEvent() : nullptr); - assert(cl_error == CL_SUCCESS); if (bDoTime) { @@ -591,8 +576,6 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, int iloc, int numParts) { - cl_int cl_error; - cl_atomdata_t *adat = nb->atdat; cl_nbparam_t *nbp = nb->nbparam; cl_plist_t *plist = nb->plist[iloc]; @@ -663,50 +646,35 @@ void nbnxn_gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t *nb, cl_kernel pruneKernel = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList); /* kernel launch config */ - size_t local_work_size[3], global_work_size[3]; - local_work_size[0] = c_clSize; - local_work_size[1] = c_clSize; - local_work_size[2] = num_threads_z; + KernelLaunchConfig config; + config.sharedMemorySize = calc_shmem_required_prune(num_threads_z); + config.stream = stream; + config.blockSize[0] = c_clSize; + config.blockSize[1] = c_clSize; + config.blockSize[2] = num_threads_z; + config.gridSize[0] = numSciInPart; - global_work_size[0] = numSciInPart * local_work_size[0]; - global_work_size[1] = 1 * local_work_size[1]; - global_work_size[2] = 1 * local_work_size[2]; - - validate_global_work_size(global_work_size, 3, nb->dev_info); - - int shmem = calc_shmem_required_prune(num_threads_z); + validate_global_work_size(config, 3, nb->dev_info); if (debug) { fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %dx%dx%d\n\t" "\tGlobal work size: %dx%d\n\t#Super-clusters/clusters: %d/%d (%d)\n" - "\tShMem: %d\n", - (int)(local_work_size[0]), (int)(local_work_size[1]), (int)(local_work_size[2]), - (int)(global_work_size[0]), (int)(global_work_size[1]), plist->nsci*c_numClPerSupercl, - c_numClPerSupercl, plist->na_c, shmem); + "\tShMem: %zu\n", + (int)(config.blockSize[0]), (int)(config.blockSize[1]), (int)(config.blockSize[2]), + (int)(config.blockSize[0] * config.gridSize[0]), (int)(config.blockSize[1] * config.gridSize[1]), plist->nsci*c_numClPerSupercl, + c_numClPerSupercl, plist->na_c, config.sharedMemorySize); } cl_nbparam_params_t nbparams_params; fillin_ocl_structures(nbp, &nbparams_params); - cl_uint arg_no = 0; - cl_error = CL_SUCCESS; - - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(nbparams_params), &(nbparams_params)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->xq)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(adat->shift_vec)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->sci)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->cj4)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(cl_mem), &(plist->imask)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(numParts)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, sizeof(int), &(part)); - cl_error |= clSetKernelArg(pruneKernel, arg_no++, shmem, nullptr); - assert(cl_error == CL_SUCCESS); - - cl_error = clEnqueueNDRangeKernel(stream, pruneKernel, 3, - nullptr, global_work_size, local_work_size, - 0, nullptr, bDoTime ? timer->fetchNextEvent() : nullptr); - GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str()); + auto *timingEvent = bDoTime ? timer->fetchNextEvent() : nullptr; + constexpr char kernelName[] = "k_pruneonly"; + const auto kernelArgs = prepareGpuKernelArguments(pruneKernel, config, + &nbparams_params, &adat->xq, &adat->shift_vec, + &plist->sci, &plist->cj4, &plist->imask, &numParts, &part); + launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs); if (plist->haveFreshList) {
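
Usage sketch (not part of the commit): the example below illustrates, for the CUDA side, how a caller combines the KernelLaunchConfig, prepareGpuKernelArguments() and launchGpuKernel() helpers introduced above. The kernel, its arguments and the launch parameters (scaleKernel, d_input, d_output, n, 128 threads per block) are hypothetical placeholders, not names from this patch.

// Hypothetical CUDA-side caller, assuming the helpers from cudautils.cuh above.
__global__ void scaleKernel(const float *input, float *output, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
    {
        output[i] = 2.0f * input[i];
    }
}

void launchScaleKernel(const float *d_input, float *d_output, int n, cudaStream_t stream)
{
    KernelLaunchConfig config;              // defaults: 1x1x1 grid/block, no dynamic shared memory
    config.blockSize[0] = 128;              // threads per block
    config.gridSize[0]  = (n + 127) / 128;  // enough blocks to cover n elements
    config.stream       = stream;

    // Packs pointers to the kernel arguments; the types are deduced from the kernel signature.
    const auto kernelArgs = prepareGpuKernelArguments(scaleKernel, config, &d_input, &d_output, &n);

    // nullptr timing event: no GPU timing region is recorded; throws gmx::InternalError on launch failure.
    launchGpuKernel(scaleKernel, config, nullptr, "scale kernel", kernelArgs);
}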
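
On the OpenCL side (oclutils.h above) the call sequence is the same, but the handle is a cl_kernel and the arguments are bound via clSetKernelArg() inside prepareGpuKernelArguments(), which therefore only returns a nullptr placeholder. The sketch below is likewise hypothetical; it assumes the cl_kernel has already been built and the buffers allocated.

// Hypothetical OpenCL-side caller, not part of this commit.
void launchScaleKernelOcl(cl_kernel scaleKernel, cl_mem d_input, cl_mem d_output, cl_int n,
                          cl_command_queue queue, cl_event *timingEvent)
{
    KernelLaunchConfig config;               // same struct and defaults as on the CUDA side
    config.blockSize[0] = 128;               // work-group size
    config.gridSize[0]  = (n + 127) / 128;   // number of work groups
    config.stream       = queue;             // CommandStream is a cl_command_queue in the OpenCL build
    // Setting config.sharedMemorySize > 0 would append a dynamic local-memory argument at the end.

    // Binds all kernel arguments via clSetKernelArg(); the returned handle is just nullptr.
    const auto kernelArgs = prepareGpuKernelArguments(scaleKernel, config, &d_input, &d_output, &n);

    // Enqueues the kernel on the queue; throws gmx::InternalError on launch failure.
    launchGpuKernel(scaleKernel, config, timingEvent, "scale kernel", kernelArgs);
}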