Fix random typos
[alexxy/gromacs.git] src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
index affbad0811729cdd7ec9e0f4dd5a4ceac5d81cdf..b4b28c06526441dbc7f2b5e162a4a1d56bd4bb13 100644
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -68,8 +68,6 @@
 #    include <limits>
 #endif
 
-#include "thread_mpi/atomic.h"
-
 #include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gputraits_ocl.h"
 #include "gromacs/gpu_utils/oclutils.h"
@@ -83,7 +81,6 @@
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/nbnxm_gpu.h"
 #include "gromacs/nbnxm/pairlist.h"
-#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/fatalerror.h"
@@ -147,7 +144,8 @@ static inline void validate_global_work_size(const KernelLaunchConfig& config,
                         "Watch out, the input system is too large to simulate!\n"
                         "The number of nonbonded work units (=number of super-clusters) exceeds the"
                         "device capabilities. Global work size limit exceeded (%zu > %zu)!",
-                        global_work_size[i], device_limit);
+                        global_work_size[i],
+                        device_limit);
             }
         }
     }
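
The check above rejects launches whose global work size cannot be represented in the device's size_t. A minimal sketch of that guard, assuming the address width was queried via clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, ...); the helper name and reporting are illustrative, not the exact GROMACS code:

#include <CL/cl.h>
#include <cstdio>

/* Sketch: return false if any dimension of the global work size exceeds
 * what the device's size_t can represent. addressBits is assumed to come
 * from clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, ...). */
static bool globalWorkSizeFits(const size_t* globalWorkSize, int workDim, cl_uint addressBits)
{
    if (addressBits >= sizeof(size_t) * 8)
    {
        return true; /* device size_t is at least as wide as the host's */
    }
    const size_t deviceLimit = (static_cast<size_t>(1) << addressBits) - 1;
    for (int i = 0; i < workDim; i++)
    {
        if (globalWorkSize[i] > deviceLimit)
        {
            std::fprintf(stderr, "Global work size limit exceeded (%zu > %zu)!\n",
                         globalWorkSize[i], deviceLimit);
            return false; /* the code above calls gmx_fatal() instead */
        }
    }
    return true;
}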
@@ -163,16 +161,24 @@ static inline void validate_global_work_size(const KernelLaunchConfig& config,
 
 /*! \brief Force-only kernel function names. */
 static const char* nb_kfunc_noener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
-    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",
+    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
-    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
-    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",
-      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
@@ -183,31 +189,43 @@ static const char* nb_kfunc_noener_noprune_ptr[c_numElecTypes][c_numVdwTypes] =
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
-    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl",
-      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
 };
 
 /*! \brief Force + energy kernel function pointers. */
 static const char* nb_kfunc_ener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
-    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
-    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
-    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl",
-      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl",
@@ -217,9 +235,12 @@ static const char* nb_kfunc_ener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
-    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl",
@@ -235,12 +256,15 @@ static const char* nb_kfunc_noener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
     { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
-    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",
@@ -257,9 +281,11 @@ static const char* nb_kfunc_noener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
-    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",
@@ -283,7 +309,8 @@ static const char* nb_kfunc_ener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
     { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",
@@ -303,7 +330,8 @@ static const char* nb_kfunc_ener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
     { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",
@@ -386,9 +414,10 @@ select_nbnxn_kernel(NbnxmGpu* nb, enum ElecType elecType, enum VdwType vdwType,
     if (nullptr == kernel_ptr[0])
     {
         *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
-        GMX_ASSERT(cl_error == CL_SUCCESS, ("clCreateKernel failed: " + ocl_get_error_string(cl_error)
-                                            + " for kernel named " + kernel_name_to_run)
-                                                   .c_str());
+        GMX_ASSERT(cl_error == CL_SUCCESS,
+                   ("clCreateKernel failed: " + ocl_get_error_string(cl_error)
+                    + " for kernel named " + kernel_name_to_run)
+                           .c_str());
     }
 
     return *kernel_ptr;
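
In the hunk above, select_nbnxn_kernel() creates each cl_kernel lazily on first use and caches it, so clCreateKernel() runs only once per kernel name. A stripped-down sketch of the same pattern, with the assertion replaced by a plain failure return for brevity:

/* Sketch: create the kernel on first use and cache it for later launches. */
static cl_kernel getOrCreateKernel(cl_program program, const char* kernelName, cl_kernel* cache)
{
    if (*cache == nullptr)
    {
        cl_int status = CL_SUCCESS;
        *cache = clCreateKernel(program, kernelName, &status);
        if (status != CL_SUCCESS)
        {
            *cache = nullptr; /* the real code asserts with ocl_get_error_string() */
        }
    }
    return *cache;
}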
@@ -459,117 +488,6 @@ static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams
     nbparams_params->vdw_switch        = nbp->vdw_switch;
 }
 
-/*! \brief Enqueues a wait for event completion.
- *
- * Then it releases the event and sets it to 0.
- * Don't use this function when more than one wait will be issued for the event.
- * Equivalent to Cuda Stream Sync. */
-static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
-{
-    cl_int gmx_unused cl_error;
-
-    /* Enqueue wait */
-    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
-    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
-
-    /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
-    cl_error = clReleaseEvent(*ocl_event);
-    GMX_ASSERT(cl_error == CL_SUCCESS,
-               ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
-    *ocl_event = nullptr;
-}
-
-/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
-    /* local/nonlocal offset and length used for xq and f */
-    int adat_begin, adat_len;
-
-    cl_atomdata_t*      adat         = nb->atdat;
-    gpu_plist*          plist        = nb->plist[iloc];
-    cl_timers_t*        t            = nb->timers;
-    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
-    bool bDoTime = nb->bDoTime;
-
-    /* Don't launch the non-local H2D copy if there is no dependent
-       work to do: neither non-local nor other (e.g. bonded) work
-       to do that has as input the nbnxn coordinates.
-       Doing the same for the local kernel is more complicated, since the
-       local part of the force array also depends on the non-local kernel.
-       So to avoid complicating the code and to reduce the risk of bugs,
-       we always call the local local x+q copy (and the rest of the local
-       work in nbnxn_gpu_launch_kernel().
-     */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        plist->haveFreshList = false;
-
-        return;
-    }
-
-    /* calculate the atom data index range based on locality */
-    if (atomLocality == AtomLocality::Local)
-    {
-        adat_begin = 0;
-        adat_len   = adat->natoms_local;
-    }
-    else
-    {
-        adat_begin = adat->natoms_local;
-        adat_len   = adat->natoms - adat->natoms_local;
-    }
-
-    /* beginning of timed HtoD section */
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
-    }
-
-    /* HtoD x, q */
-    GMX_ASSERT(sizeof(float) == sizeof(*nbatom->x().data()),
-               "The size of the xyzq buffer element should be equal to the size of float4.");
-    copyToDeviceBuffer(&adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin * 4, adat_len * 4,
-                       deviceStream, GpuApiCallBehavior::Async,
-                       bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
-    }
-
-    /* When we get here all misc operations issues in the local stream as well as
-       the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one. */
-    if (nb->bUseTwoStreams)
-    {
-        if (iloc == InteractionLocality::Local)
-        {
-            cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
-                    deviceStream.stream(), 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-
-            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-             * in the local stream in order to be able to sync with the above event
-             * from the non-local stream.
-             */
-            cl_error = clFlush(deviceStream.stream());
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-        }
-        else
-        {
-            sync_ocl_event(deviceStream.stream(), &(nb->misc_ops_and_local_H2D_done));
-        }
-    }
-}
-
-
 /*! \brief Launch GPU kernel
 
    As we execute nonbonded workload in separate queues, before launching
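
The functions removed above implemented cross-queue synchronization by hand: a marker event recorded in the local queue, a clFlush() (required by OpenCL 1.2 section 5.13 before another queue may wait on the event), and a barrier-with-wait-list in the non-local queue via sync_ocl_event(). The bare pattern, as a sketch with return-code checks elided:

#include <CL/cl.h>

/* Sketch: make 'consumer' wait for all work enqueued so far in 'producer'. */
static void orderQueues(cl_command_queue producer, cl_command_queue consumer)
{
    cl_event done = nullptr;
    clEnqueueMarkerWithWaitList(producer, 0, nullptr, &done);
    clFlush(producer); /* make the marker visible to the other queue */
    clEnqueueBarrierWithWaitList(consumer, 1, &done, nullptr);
    clReleaseEvent(done); /* the enqueued wait keeps its own reference */
}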
@@ -590,10 +508,10 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
  */
 void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
 {
-    cl_atomdata_t*      adat         = nb->atdat;
+    NBAtomDataGpu*      adat         = nb->atdat;
     NBParamGpu*         nbp          = nb->nbparam;
     gpu_plist*          plist        = nb->plist[iloc];
-    cl_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
     bool bDoTime = nb->bDoTime;
@@ -635,7 +553,7 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
     /* beginning of timed nonbonded calculation section */
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.openTimingRegion(deviceStream);
+        timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
     }
 
     /* kernel launch config */
@@ -653,18 +571,25 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
         fprintf(debug,
                 "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                 "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
-                config.blockSize[0], config.blockSize[1], config.blockSize[2],
-                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
+                config.blockSize[0],
+                config.blockSize[1],
+                config.blockSize[2],
+                config.blockSize[0] * config.gridSize[0],
+                config.blockSize[1] * config.gridSize[1],
                 plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
-                c_nbnxnGpuNumClusterPerSupercluster, plist->na_c);
+                c_nbnxnGpuNumClusterPerSupercluster,
+                plist->na_c);
     }
 
     fillin_ocl_structures(nbp, &nbparams_params);
 
-    auto*          timingEvent  = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+    auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
     constexpr char kernelName[] = "k_calc_nb";
     const auto     kernel =
-            select_nbnxn_kernel(nb, nbp->elecType, nbp->vdwType, stepWork.computeEnergy,
+            select_nbnxn_kernel(nb,
+                                nbp->elecType,
+                                nbp->vdwType,
+                                stepWork.computeEnergy,
                                 (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
 
 
@@ -673,25 +598,52 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
     const int computeFshift = static_cast<int>(stepWork.computeVirial);
     if (useLjCombRule(nb->nbparam->vdwType))
     {
-        const auto kernelArgs = prepareGpuKernelArguments(
-                kernel, config, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el,
-                &adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp, &nbp->nbfp_comb,
-                &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
+        const auto kernelArgs = prepareGpuKernelArguments(kernel,
+                                                          config,
+                                                          &nbparams_params,
+                                                          &adat->xq,
+                                                          &adat->f,
+                                                          &adat->eLJ,
+                                                          &adat->eElec,
+                                                          &adat->fShift,
+                                                          &adat->ljComb,
+                                                          &adat->shiftVec,
+                                                          &nbp->nbfp,
+                                                          &nbp->nbfp_comb,
+                                                          &nbp->coulomb_tab,
+                                                          &plist->sci,
+                                                          &plist->cj4,
+                                                          &plist->excl,
+                                                          &computeFshift);
 
         launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
     }
     else
     {
-        const auto kernelArgs = prepareGpuKernelArguments(
-                kernel, config, &adat->ntypes, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj,
-                &adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp, &nbp->nbfp_comb,
-                &nbp->coulomb_tab, &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
+        const auto kernelArgs = prepareGpuKernelArguments(kernel,
+                                                          config,
+                                                          &adat->numTypes,
+                                                          &nbparams_params,
+                                                          &adat->xq,
+                                                          &adat->f,
+                                                          &adat->eLJ,
+                                                          &adat->eElec,
+                                                          &adat->fShift,
+                                                          &adat->atomTypes,
+                                                          &adat->shiftVec,
+                                                          &nbp->nbfp,
+                                                          &nbp->nbfp_comb,
+                                                          &nbp->coulomb_tab,
+                                                          &plist->sci,
+                                                          &plist->cj4,
+                                                          &plist->excl,
+                                                          &computeFshift);
         launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
     }
 
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
+        timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
     }
 }
 
@@ -702,7 +654,8 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
  *  for OpenCL local memory.
  *
  * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd
- * dimension. \returns   the amount of local memory in bytes required by the pruning kernel
+ * dimension.
+ * \returns   the amount of local memory in bytes required by the pruning kernel
  */
 static inline int calc_shmem_required_prune(const int num_threads_z)
 {
@@ -727,10 +680,10 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
  */
 void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
 {
-    cl_atomdata_t*      adat         = nb->atdat;
+    NBAtomDataGpu*      adat         = nb->atdat;
     NBParamGpu*         nbp          = nb->nbparam;
     gpu_plist*          plist        = nb->plist[iloc];
-    cl_timers_t*        t            = nb->timers;
+    Nbnxm::GpuTimers*   timers       = nb->timers;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
     bool                bDoTime      = nb->bDoTime;
 
@@ -780,7 +733,8 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     GpuRegionTimer* timer = nullptr;
     if (bDoTime)
     {
-        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+        timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+                                       : timers->interaction[iloc].rollingPrune_k);
     }
 
     /* beginning of timed prune calculation section */
@@ -794,9 +748,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
      *   and j-cluster concurrency, in x, y, and z, respectively.
      * - The 1D block-grid contains as many blocks as super-clusters.
      */
-    int num_threads_z = c_oclPruneKernelJ4ConcurrencyDEFAULT;
-
-
+    int num_threads_z = c_pruneKernelJ4Concurrency;
     /* kernel launch config */
     KernelLaunchConfig config;
     config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
@@ -813,10 +765,15 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
                 "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                 "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                 "\tShMem: %zu\n",
-                config.blockSize[0], config.blockSize[1], config.blockSize[2],
-                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
+                config.blockSize[0],
+                config.blockSize[1],
+                config.blockSize[2],
+                config.blockSize[0] * config.gridSize[0],
+                config.blockSize[1] * config.gridSize[1],
                 plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
-                c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
+                c_nbnxnGpuNumClusterPerSupercluster,
+                plist->na_c,
+                config.sharedMemorySize);
     }
 
     cl_nbparam_params_t nbparams_params;
@@ -825,9 +782,16 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     auto*          timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
     constexpr char kernelName[] = "k_pruneonly";
     const auto     pruneKernel  = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
-    const auto     kernelArgs   = prepareGpuKernelArguments(pruneKernel, config, &nbparams_params,
-                                                      &adat->xq, &adat->shift_vec, &plist->sci,
-                                                      &plist->cj4, &plist->imask, &numParts, &part);
+    const auto     kernelArgs   = prepareGpuKernelArguments(pruneKernel,
+                                                      config,
+                                                      &nbparams_params,
+                                                      &adat->xq,
+                                                      &adat->shiftVec,
+                                                      &plist->sci,
+                                                      &plist->cj4,
+                                                      &plist->imask,
+                                                      &numParts,
+                                                      &part);
     launchGpuKernel(pruneKernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
 
     if (plist->haveFreshList)
@@ -848,113 +812,4 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     }
 }
 
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */
-void gpu_launch_cpyback(NbnxmGpu*                nb,
-                        struct nbnxn_atomdata_t* nbatom,
-                        const gmx::StepWorkload& stepWork,
-                        const AtomLocality       aloc)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    cl_int gmx_unused cl_error;
-    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
-
-    /* determine interaction locality from atom locality */
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
-
-    cl_atomdata_t*      adat         = nb->atdat;
-    cl_timers_t*        t            = nb->timers;
-    bool                bDoTime      = nb->bDoTime;
-    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
-
-    /* don't launch non-local copy-back if there was no non-local work to do */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        /* TODO An alternative way to signal that non-local work is
-           complete is to use a clEnqueueMarker+clEnqueueBarrier
-           pair. However, the use of bNonLocalStreamActive has the
-           advantage of being local to the host, so probably minimizes
-           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
-           test case, overall simulation performance was higher with
-           the API calls, but this has not been tested on AMD OpenCL,
-           so could be worth considering in future. */
-        nb->bNonLocalStreamActive = CL_FALSE;
-        return;
-    }
-
-    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
-
-    /* beginning of timed D2H section */
-    if (bDoTime)
-    {
-        t->xf[aloc].nb_d2h.openTimingRegion(deviceStream);
-    }
-
-    /* With DD the local D2H transfer can only start after the non-local
-       has been launched. */
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
-    {
-        sync_ocl_event(deviceStream.stream(), &(nb->nonlocal_done));
-    }
-
-    /* DtoH f */
-    GMX_ASSERT(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
-               "The host force buffer should be in single precision to match device data size.");
-    copyFromDeviceBuffer(&nbatom->out[0].f[adat_begin * DIM], &adat->f, adat_begin * DIM,
-                         adat_len * DIM, deviceStream, GpuApiCallBehavior::Async,
-                         bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
-    /* kick off work */
-    cl_error = clFlush(deviceStream.stream());
-    GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-
-    /* After the non-local D2H is launched the nonlocal_done event can be
-       recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we first need the non-local
-       data back first. */
-    if (iloc == InteractionLocality::NonLocal)
-    {
-        cl_error = clEnqueueMarkerWithWaitList(deviceStream.stream(), 0, nullptr, &(nb->nonlocal_done));
-        GMX_ASSERT(cl_error == CL_SUCCESS,
-                   ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-        nb->bNonLocalStreamActive = CL_TRUE;
-    }
-
-    /* only transfer energies in the local stream */
-    if (iloc == InteractionLocality::Local)
-    {
-        /* DtoH fshift when virial is needed */
-        if (stepWork.computeVirial)
-        {
-            GMX_ASSERT(sizeof(*nb->nbst.fshift) == DIM * sizeof(float),
-                       "Sizes of host- and device-side shift vector elements should be the same.");
-            copyFromDeviceBuffer(reinterpret_cast<float*>(nb->nbst.fshift), &adat->fshift, 0,
-                                 SHIFTS * DIM, deviceStream, GpuApiCallBehavior::Async,
-                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-        }
-
-        /* DtoH energies */
-        if (stepWork.computeEnergy)
-        {
-            GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float),
-                       "Sizes of host- and device-side LJ energy terms should be the same.");
-            copyFromDeviceBuffer(nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream, GpuApiCallBehavior::Async,
-                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-            GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float),
-                       "Sizes of host- and device-side electrostatic energy terms should be the "
-                       "same.");
-            copyFromDeviceBuffer(nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream, GpuApiCallBehavior::Async,
-                                 bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-        }
-    }
-
-    if (bDoTime)
-    {
-        t->xf[aloc].nb_d2h.closeTimingRegion(deviceStream);
-    }
-}
-
 } // namespace Nbnxm