Fix random typos
[alexxy/gromacs.git] / src / gromacs / nbnxm / opencl / nbnxm_ocl.cpp
index 12d38ed2677adf97a65a55a3977e13d087d5db34..b4b28c06526441dbc7f2b5e162a4a1d56bd4bb13 100644 (file)
@@ -2,7 +2,7 @@
  * This file is part of the GROMACS molecular simulation package.
  *
  * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
 #    include <limits>
 #endif
 
-#include "thread_mpi/atomic.h"
-
+#include "gromacs/gpu_utils/device_context.h"
 #include "gromacs/gpu_utils/gputraits_ocl.h"
 #include "gromacs/gpu_utils/oclutils.h"
+#include "gromacs/hardware/device_information.h"
 #include "gromacs/hardware/hw_info.h"
 #include "gromacs/mdtypes/simulation_workload.h"
 #include "gromacs/nbnxm/atomdata.h"
 #include "gromacs/nbnxm/nbnxm.h"
 #include "gromacs/nbnxm/nbnxm_gpu.h"
 #include "gromacs/nbnxm/pairlist.h"
-#include "gromacs/pbcutil/ishift.h"
 #include "gromacs/timing/gpu_timing.h"
 #include "gromacs/utility/cstringutil.h"
 #include "gromacs/utility/fatalerror.h"
 #include "gromacs/utility/gmxassert.h"
 
-#include "nbnxm_ocl_internal.h"
 #include "nbnxm_ocl_types.h"
 
 namespace Nbnxm
@@ -103,7 +101,7 @@ static constexpr int c_clSize = c_nbnxnGpuClusterSize;
  */
 static inline void validate_global_work_size(const KernelLaunchConfig& config,
                                              int                       work_dim,
-                                             const gmx_device_info_t*  dinfo)
+                                             const DeviceInformation*  dinfo)
 {
     cl_uint device_size_t_size_bits;
     cl_uint host_size_t_size_bits;
@@ -146,7 +144,8 @@ static inline void validate_global_work_size(const KernelLaunchConfig& config,
                         "Watch out, the input system is too large to simulate!\n"
                         "The number of nonbonded work units (=number of super-clusters) exceeds the"
                         "device capabilities. Global work size limit exceeded (%zu > %zu)!",
-                        global_work_size[i], device_limit);
+                        global_work_size[i],
+                        device_limit);
             }
         }
     }
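
The check above guards against the global work size overflowing the device-side size_t when the host's size_t is wider, e.g. a 64-bit host driving a device with a 32-bit size_t. A minimal sketch of the idea, assuming the device width comes from clGetDeviceInfo with CL_DEVICE_ADDRESS_BITS; the real helper queries and compares both widths before computing the limit:

    // Sketch only: reject launches whose global work size cannot be
    // represented in the device-side size_t. deviceSizeTBits is assumed to
    // come from clGetDeviceInfo(..., CL_DEVICE_ADDRESS_BITS, ...).
    static bool globalWorkSizeFits(const size_t globalWorkSize[3], int workDim, cl_uint deviceSizeTBits)
    {
        const size_t deviceLimit = (deviceSizeTBits < CHAR_BIT * sizeof(size_t))
                                           ? ((size_t(1) << deviceSizeTBits) - 1)
                                           : std::numeric_limits<size_t>::max();
        for (int i = 0; i < workDim; i++)
        {
            if (globalWorkSize[i] > deviceLimit)
            {
                return false; // corresponds to the gmx_fatal() path above
            }
        }
        return true;
    }
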
@@ -161,17 +160,25 @@ static inline void validate_global_work_size(const KernelLaunchConfig& config,
  */
 
 /*! \brief Force-only kernel function names. */
-static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] = {
-    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",
+static const char* nb_kfunc_noener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
+    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
-    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
-    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",
-      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
@@ -182,31 +189,43 @@ static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] = {
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
-    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl",
-      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
 };
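
Every entry in these tables follows one naming scheme, nbnxn_kernel_<ElecType>_<VdwType>_<F|VF>[_prune]_opencl, indexed as [elecType][vdwType]. A composition sketch for illustration only; the helper below is hypothetical, and the real code keeps the explicit tables so the strings stay greppable against the compiled OpenCL program:

    #include <string>

    // Hypothetical illustration of the naming scheme behind the tables above;
    // not part of the GROMACS sources.
    static std::string composeKernelName(const std::string& elec, // e.g. "ElecRF"
                                         const std::string& vdw,  // e.g. "VdwLJFsw"
                                         bool               doEnergy,
                                         bool               doPrune)
    {
        return "nbnxn_kernel_" + elec + "_" + vdw + (doEnergy ? "_VF" : "_F")
               + (doPrune ? "_prune" : "") + "_opencl";
    }
    // composeKernelName("ElecCut", "VdwLJ", false, false)
    //     == "nbnxn_kernel_ElecCut_VdwLJ_F_opencl", the first entry above.
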
 
 /*! \brief Force + energy kernel function pointers. */
-static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] = {
-    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",
+static const char* nb_kfunc_ener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
-    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
-    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl",
-      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
     { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl",
@@ -216,9 +235,12 @@ static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] = {
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
-    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",
       "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl",
@@ -230,16 +252,19 @@ static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] = {
 };
 
 /*! \brief Force + pruning kernel function pointers. */
-static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] = {
+static const char* nb_kfunc_noener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
     { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl",
-      "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
-    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",
@@ -256,9 +281,11 @@ static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] = {
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
-    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",
@@ -271,7 +298,7 @@ static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] = {
 };
 
 /*! \brief Force + energy + pruning kernel function pointers. */
-static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] = {
+static const char* nb_kfunc_ener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
     { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl",
       "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl",
@@ -282,7 +309,8 @@ static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] = {
     { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl",
-      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",
       "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
     { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",
@@ -302,7 +330,8 @@ static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] = {
     { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl",
-      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",
       "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
     { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",
@@ -340,41 +369,45 @@ static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[], bool fir
  *  OpenCL kernel objects are cached in nb. If the requested kernel is not
  *  found in the cache, it will be created and the cache will be updated.
  */
-static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwtype, bool bDoEne, bool bDoPrune)
+static inline cl_kernel
+select_nbnxn_kernel(NbnxmGpu* nb, enum ElecType elecType, enum VdwType vdwType, bool bDoEne, bool bDoPrune)
 {
     const char* kernel_name_to_run;
     cl_kernel*  kernel_ptr;
     cl_int      cl_error;
 
-    GMX_ASSERT(eeltype < eelOclNR,
+    const int elecTypeIdx = static_cast<int>(elecType);
+    const int vdwTypeIdx  = static_cast<int>(vdwType);
+
+    GMX_ASSERT(elecTypeIdx < c_numElecTypes,
                "The electrostatics type requested is not implemented in the OpenCL kernels.");
-    GMX_ASSERT(evdwtype < evdwOclNR,
+    GMX_ASSERT(vdwTypeIdx < c_numVdwTypes,
                "The VdW type requested is not implemented in the OpenCL kernels.");
 
     if (bDoEne)
     {
         if (bDoPrune)
         {
-            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
-            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
+            kernel_name_to_run = nb_kfunc_ener_prune_ptr[elecTypeIdx][vdwTypeIdx];
+            kernel_ptr         = &(nb->kernel_ener_prune_ptr[elecTypeIdx][vdwTypeIdx]);
         }
         else
         {
-            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
-            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
+            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
+            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[elecTypeIdx][vdwTypeIdx]);
         }
     }
     else
     {
         if (bDoPrune)
         {
-            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
-            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
+            kernel_name_to_run = nb_kfunc_noener_prune_ptr[elecTypeIdx][vdwTypeIdx];
+            kernel_ptr         = &(nb->kernel_noener_prune_ptr[elecTypeIdx][vdwTypeIdx]);
         }
         else
         {
-            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
-            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
+            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
+            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[elecTypeIdx][vdwTypeIdx]);
         }
     }
 
@@ -382,7 +415,9 @@ static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwt
     {
         *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
         GMX_ASSERT(cl_error == CL_SUCCESS,
-                   ("clCreateKernel failed: " + ocl_get_error_string(cl_error)).c_str());
+                   ("clCreateKernel failed: " + ocl_get_error_string(cl_error)
+                    + " for kernel named " + kernel_name_to_run)
+                           .c_str());
     }
 
     return *kernel_ptr;
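
select_nbnxn_kernel() is a lazy per-combination cache: the first request for a given (elecType, vdwType, energy, prune) tuple pays the clCreateKernel() cost, and every later request returns the stored cl_kernel. A usage sketch, assuming an initialized NbnxmGpu* nb and that the enumerators named below exist in the new headers:

    cl_kernel k1 = select_nbnxn_kernel(nb, ElecType::Cut, VdwType::Cut,
                                       /*bDoEne=*/false, /*bDoPrune=*/false);
    cl_kernel k2 = select_nbnxn_kernel(nb, ElecType::Cut, VdwType::Cut, false, false);
    GMX_ASSERT(k1 == k2, "Repeated requests should return the cached kernel object");
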
@@ -390,7 +425,7 @@ static inline cl_kernel select_nbnxn_kernel(NbnxmGpu* nb, int eeltype, int evdwt
 
 /*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
  */
-static inline int calc_shmem_required_nonbonded(int vdwType, bool bPrefetchLjParam)
+static inline int calc_shmem_required_nonbonded(enum VdwType vdwType, bool bPrefetchLjParam)
 {
     int shmem;
 
@@ -431,12 +466,12 @@ static inline int calc_shmem_required_nonbonded(int vdwType, bool bPrefetchLjPar
  *
  *  This function is called before the launch of both nbnxn and prune kernels.
  */
-static void fillin_ocl_structures(cl_nbparam_t* nbp, cl_nbparam_params_t* nbparams_params)
+static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams_params)
 {
     nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
     nbparams_params->c_rf              = nbp->c_rf;
     nbparams_params->dispersion_shift  = nbp->dispersion_shift;
-    nbparams_params->eeltype           = nbp->eeltype;
+    nbparams_params->elecType          = nbp->elecType;
     nbparams_params->epsfac            = nbp->epsfac;
     nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
     nbparams_params->ewald_beta        = nbp->ewald_beta;
@@ -449,119 +484,10 @@ static void fillin_ocl_structures(cl_nbparam_t* nbp, cl_nbparam_params_t* nbpara
     nbparams_params->sh_ewald          = nbp->sh_ewald;
     nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
     nbparams_params->two_k_rf          = nbp->two_k_rf;
-    nbparams_params->vdwtype           = nbp->vdwtype;
+    nbparams_params->vdwType           = nbp->vdwType;
     nbparams_params->vdw_switch        = nbp->vdw_switch;
 }
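
fillin_ocl_structures() exists because the kernels receive the parameter block by value, and a struct holding cl_mem handles cannot be passed that way; cl_nbparam_params_t mirrors only the scalar fields of NBParamGpu, while the table and force-field buffers are bound as separate kernel arguments. A reduced sketch of the pattern with hypothetical two-field types:

    // Hypothetical types illustrating the host/device split.
    struct HostParams   // host side: scalars plus buffer handles
    {
        float  epsfac;
        cl_mem coulomb_tab;
    };
    struct KernelParams // device-side mirror: scalars only
    {
        float epsfac;
    };

    static void fillKernelParams(const HostParams* host, KernelParams* params)
    {
        params->epsfac = host->epsfac; // buffers are bound via clSetKernelArg() instead
    }
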
 
-/*! \brief Enqueues a wait for event completion.
- *
- * Then it releases the event and sets it to 0.
- * Don't use this function when more than one wait will be issued for the event.
- * Equivalent to Cuda Stream Sync. */
-static void sync_ocl_event(cl_command_queue stream, cl_event* ocl_event)
-{
-    cl_int gmx_unused cl_error;
-
-    /* Enqueue wait */
-    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
-    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
-
-    /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
-    cl_error = clReleaseEvent(*ocl_event);
-    GMX_ASSERT(cl_error == CL_SUCCESS,
-               ("clReleaseEvent failed: " + ocl_get_error_string(cl_error)).c_str());
-    *ocl_event = nullptr;
-}
-
-/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
-    /* local/nonlocal offset and length used for xq and f */
-    int adat_begin, adat_len;
-
-    cl_atomdata_t*   adat   = nb->atdat;
-    cl_plist_t*      plist  = nb->plist[iloc];
-    cl_timers_t*     t      = nb->timers;
-    cl_command_queue stream = nb->stream[iloc];
-
-    bool bDoTime = (nb->bDoTime) != 0;
-
-    /* Don't launch the non-local H2D copy if there is no dependent
-       work to do: neither non-local nor other (e.g. bonded) work
-       to do that has as input the nbnxn coordinates.
-       Doing the same for the local kernel is more complicated, since the
-       local part of the force array also depends on the non-local kernel.
-       So to avoid complicating the code and to reduce the risk of bugs,
-       we always call the local local x+q copy (and the rest of the local
-       work in nbnxn_gpu_launch_kernel().
-     */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        plist->haveFreshList = false;
-
-        return;
-    }
-
-    /* calculate the atom data index range based on locality */
-    if (atomLocality == AtomLocality::Local)
-    {
-        adat_begin = 0;
-        adat_len   = adat->natoms_local;
-    }
-    else
-    {
-        adat_begin = adat->natoms_local;
-        adat_len   = adat->natoms - adat->natoms_local;
-    }
-
-    /* beginning of timed HtoD section */
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
-    }
-
-    /* HtoD x, q */
-    ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4,
-                       adat_begin * sizeof(float) * 4, adat_len * sizeof(float) * 4, stream,
-                       bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
-    }
-
-    /* When we get here all misc operations issues in the local stream as well as
-       the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one. */
-    if (nb->bUseTwoStreams)
-    {
-        if (iloc == InteractionLocality::Local)
-        {
-            cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(
-                    stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-
-            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-             * in the local stream in order to be able to sync with the above event
-             * from the non-local stream.
-             */
-            cl_error = clFlush(stream);
-            GMX_ASSERT(cl_error == CL_SUCCESS,
-                       ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-        }
-        else
-        {
-            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
-        }
-    }
-}
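
The two helpers removed above implemented cross-queue ordering with OpenCL events: record a marker in the producing queue, flush it, then have the consuming queue wait on the event and release it (the enqueued wait holds its own retain). A minimal sketch of that pattern with hypothetical queue names, error checking elided:

    cl_event localH2DDone;
    clEnqueueMarkerWithWaitList(localQueue, 0, nullptr, &localH2DDone);
    // Per sec. 5.13 of the OpenCL 1.2 spec, a flush is needed before another
    // queue can synchronize on the event.
    clFlush(localQueue);
    clEnqueueBarrierWithWaitList(nonLocalQueue, 1, &localH2DDone, nullptr);
    clReleaseEvent(localH2DDone); // safe: the enqueued barrier retains the event
    localH2DDone = nullptr;
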
-
-
 /*! \brief Launch GPU kernel
 
    As we execute nonbonded workload in separate queues, before launching
@@ -582,13 +508,13 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
  */
 void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
 {
-    cl_atomdata_t*   adat   = nb->atdat;
-    cl_nbparam_t*    nbp    = nb->nbparam;
-    cl_plist_t*      plist  = nb->plist[iloc];
-    cl_timers_t*     t      = nb->timers;
-    cl_command_queue stream = nb->stream[iloc];
+    NBAtomDataGpu*      adat         = nb->atdat;
+    NBParamGpu*         nbp          = nb->nbparam;
+    gpu_plist*          plist        = nb->plist[iloc];
+    Nbnxm::GpuTimers*   timers       = nb->timers;
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
 
-    bool bDoTime = (nb->bDoTime) != 0;
+    bool bDoTime = nb->bDoTime;
 
     cl_nbparam_params_t nbparams_params;
 
@@ -627,65 +553,97 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
     /* beginning of timed nonbonded calculation section */
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.openTimingRegion(stream);
+        timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
     }
 
     /* kernel launch config */
 
     KernelLaunchConfig config;
-    config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
-    config.stream           = stream;
+    config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwType, nb->bPrefetchLjParam);
     config.blockSize[0]     = c_clSize;
     config.blockSize[1]     = c_clSize;
     config.gridSize[0]      = plist->nsci;
 
-    validate_global_work_size(config, 3, nb->dev_info);
+    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
 
     if (debug)
     {
         fprintf(debug,
                 "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                 "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
-                config.blockSize[0], config.blockSize[1], config.blockSize[2],
-                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
+                config.blockSize[0],
+                config.blockSize[1],
+                config.blockSize[2],
+                config.blockSize[0] * config.gridSize[0],
+                config.blockSize[1] * config.gridSize[1],
                 plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
-                c_nbnxnGpuNumClusterPerSupercluster, plist->na_c);
+                c_nbnxnGpuNumClusterPerSupercluster,
+                plist->na_c);
     }
 
     fillin_ocl_structures(nbp, &nbparams_params);
 
-    auto*          timingEvent  = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+    auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
     constexpr char kernelName[] = "k_calc_nb";
     const auto     kernel =
-            select_nbnxn_kernel(nb, nbp->eeltype, nbp->vdwtype, stepWork.computeEnergy,
+            select_nbnxn_kernel(nb,
+                                nbp->elecType,
+                                nbp->vdwType,
+                                stepWork.computeEnergy,
                                 (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
 
 
     // The OpenCL kernel takes int as second to last argument because bool is
     // not supported as a kernel argument type (sizeof(bool) is implementation defined).
     const int computeFshift = static_cast<int>(stepWork.computeVirial);
-    if (useLjCombRule(nb->nbparam->vdwtype))
-    {
-        const auto kernelArgs = prepareGpuKernelArguments(
-                kernel, config, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el,
-                &adat->fshift, &adat->lj_comb, &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d,
-                &nbp->coulomb_tab_climg2d, &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
-
-        launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+    if (useLjCombRule(nb->nbparam->vdwType))
+    {
+        const auto kernelArgs = prepareGpuKernelArguments(kernel,
+                                                          config,
+                                                          &nbparams_params,
+                                                          &adat->xq,
+                                                          &adat->f,
+                                                          &adat->eLJ,
+                                                          &adat->eElec,
+                                                          &adat->fShift,
+                                                          &adat->ljComb,
+                                                          &adat->shiftVec,
+                                                          &nbp->nbfp,
+                                                          &nbp->nbfp_comb,
+                                                          &nbp->coulomb_tab,
+                                                          &plist->sci,
+                                                          &plist->cj4,
+                                                          &plist->excl,
+                                                          &computeFshift);
+
+        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
     }
     else
     {
-        const auto kernelArgs = prepareGpuKernelArguments(
-                kernel, config, &adat->ntypes, &nbparams_params, &adat->xq, &adat->f, &adat->e_lj,
-                &adat->e_el, &adat->fshift, &adat->atom_types, &adat->shift_vec, &nbp->nbfp_climg2d,
-                &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d, &plist->sci, &plist->cj4,
-                &plist->excl, &computeFshift);
-        launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+        const auto kernelArgs = prepareGpuKernelArguments(kernel,
+                                                          config,
+                                                          &adat->numTypes,
+                                                          &nbparams_params,
+                                                          &adat->xq,
+                                                          &adat->f,
+                                                          &adat->eLJ,
+                                                          &adat->eElec,
+                                                          &adat->fShift,
+                                                          &adat->atomTypes,
+                                                          &adat->shiftVec,
+                                                          &nbp->nbfp,
+                                                          &nbp->nbfp_comb,
+                                                          &nbp->coulomb_tab,
+                                                          &plist->sci,
+                                                          &plist->cj4,
+                                                          &plist->excl,
+                                                          &computeFshift);
+        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
     }
 
     if (bDoTime)
     {
-        t->interaction[iloc].nb_k.closeTimingRegion(stream);
+        timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
     }
 }
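
The computeFshift conversion above reflects a general OpenCL constraint: sizeof(bool) is implementation-defined in OpenCL C, so flags cross the host/device boundary as int. A minimal sketch, where argIndex is a hypothetical placeholder for the flag's position in the argument list:

    // Host side: pass the flag as a plain int.
    const int computeFshift = static_cast<int>(stepWork.computeVirial);
    clSetKernelArg(kernel, argIndex, sizeof(int), &computeFshift);
    // Device side (OpenCL C):
    //   __kernel void nb(..., const int computeFshift)
    //   {
    //       if (computeFshift) { /* accumulate shift forces */ }
    //   }
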
 
@@ -696,7 +654,8 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb
  *  for OpenCL local memory.
  *
  * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd
- * dimension. \returns   the amount of local memory in bytes required by the pruning kernel
+ * dimension.
+ * \returns   the amount of local memory in bytes required by the pruning kernel
  */
 static inline int calc_shmem_required_prune(const int num_threads_z)
 {
@@ -721,12 +680,12 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
  */
 void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
 {
-    cl_atomdata_t*   adat    = nb->atdat;
-    cl_nbparam_t*    nbp     = nb->nbparam;
-    cl_plist_t*      plist   = nb->plist[iloc];
-    cl_timers_t*     t       = nb->timers;
-    cl_command_queue stream  = nb->stream[iloc];
-    bool             bDoTime = nb->bDoTime == CL_TRUE;
+    NBAtomDataGpu*      adat         = nb->atdat;
+    NBParamGpu*         nbp          = nb->nbparam;
+    gpu_plist*          plist        = nb->plist[iloc];
+    Nbnxm::GpuTimers*   timers       = nb->timers;
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+    bool                bDoTime      = nb->bDoTime;
 
     if (plist->haveFreshList)
     {
@@ -774,13 +733,14 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     GpuRegionTimer* timer = nullptr;
     if (bDoTime)
     {
-        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+        timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+                                       : timers->interaction[iloc].rollingPrune_k);
     }
 
     /* beginning of timed prune calculation section */
     if (bDoTime)
     {
-        timer->openTimingRegion(stream);
+        timer->openTimingRegion(deviceStream);
     }
 
     /* Kernel launch config:
@@ -788,18 +748,16 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
      *   and j-cluster concurrency, in x, y, and z, respectively.
      * - The 1D block-grid contains as many blocks as super-clusters.
      */
-    int num_threads_z = getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e);
-
+    int num_threads_z = c_pruneKernelJ4Concurrency;
     /* kernel launch config */
     KernelLaunchConfig config;
     config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
-    config.stream           = stream;
     config.blockSize[0]     = c_clSize;
     config.blockSize[1]     = c_clSize;
     config.blockSize[2]     = num_threads_z;
     config.gridSize[0]      = numSciInPart;
 
-    validate_global_work_size(config, 3, nb->dev_info);
+    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
 
     if (debug)
     {
@@ -807,10 +765,15 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
                 "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                 "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                 "\tShMem: %zu\n",
-                config.blockSize[0], config.blockSize[1], config.blockSize[2],
-                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1],
+                config.blockSize[0],
+                config.blockSize[1],
+                config.blockSize[2],
+                config.blockSize[0] * config.gridSize[0],
+                config.blockSize[1] * config.gridSize[1],
                 plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
-                c_nbnxnGpuNumClusterPerSupercluster, plist->na_c, config.sharedMemorySize);
+                c_nbnxnGpuNumClusterPerSupercluster,
+                plist->na_c,
+                config.sharedMemorySize);
     }
 
     cl_nbparam_params_t nbparams_params;
@@ -819,10 +782,17 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
     auto*          timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
     constexpr char kernelName[] = "k_pruneonly";
     const auto     pruneKernel  = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
-    const auto     kernelArgs   = prepareGpuKernelArguments(pruneKernel, config, &nbparams_params,
-                                                      &adat->xq, &adat->shift_vec, &plist->sci,
-                                                      &plist->cj4, &plist->imask, &numParts, &part);
-    launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs);
+    const auto     kernelArgs   = prepareGpuKernelArguments(pruneKernel,
+                                                      config,
+                                                      &nbparams_params,
+                                                      &adat->xq,
+                                                      &adat->shiftVec,
+                                                      &plist->sci,
+                                                      &plist->cj4,
+                                                      &plist->imask,
+                                                      &numParts,
+                                                      &part);
+    launchGpuKernel(pruneKernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
 
     if (plist->haveFreshList)
     {
@@ -838,166 +808,8 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c
 
     if (bDoTime)
     {
-        timer->closeTimingRegion(stream);
+        timer->closeTimingRegion(deviceStream);
     }
 }
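
Rolling pruning spreads a full prune over numParts invocations, so numSciInPart above covers only the current chunk of the sci list. A sketch of the partitioning arithmetic, assuming a simple ceil-division split; the exact rounding used by the production code may differ:

    // Bounds of chunk `part` (0 <= part < numParts) of an nsci-entry list.
    static void pruningPartBounds(int nsci, int numParts, int part, int* begin, int* end)
    {
        const int chunk = (nsci + numParts - 1) / numParts; // ceil(nsci / numParts)
        *begin          = part * chunk;
        *end            = (*begin + chunk < nsci) ? *begin + chunk : nsci;
    }
    // One gpu_launch_kernel_pruneonly() call then prunes entries [*begin, *end),
    // amortizing the cost of a fresh-list prune over numParts MD steps.
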
 
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */
-void gpu_launch_cpyback(NbnxmGpu*                nb,
-                        struct nbnxn_atomdata_t* nbatom,
-                        const gmx::StepWorkload& stepWork,
-                        const AtomLocality       aloc)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    cl_int gmx_unused cl_error;
-    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
-
-    /* determine interaction locality from atom locality */
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
-
-    cl_atomdata_t*   adat    = nb->atdat;
-    cl_timers_t*     t       = nb->timers;
-    bool             bDoTime = nb->bDoTime == CL_TRUE;
-    cl_command_queue stream  = nb->stream[iloc];
-
-    /* don't launch non-local copy-back if there was no non-local work to do */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        /* TODO An alternative way to signal that non-local work is
-           complete is to use a clEnqueueMarker+clEnqueueBarrier
-           pair. However, the use of bNonLocalStreamActive has the
-           advantage of being local to the host, so probably minimizes
-           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
-           test case, overall simulation performance was higher with
-           the API calls, but this has not been tested on AMD OpenCL,
-           so could be worth considering in future. */
-        nb->bNonLocalStreamActive = CL_FALSE;
-        return;
-    }
-
-    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
-
-    /* beginning of timed D2H section */
-    if (bDoTime)
-    {
-        t->xf[aloc].nb_d2h.openTimingRegion(stream);
-    }
-
-    /* With DD the local D2H transfer can only start after the non-local
-       has been launched. */
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
-    {
-        sync_ocl_event(stream, &(nb->nonlocal_done));
-    }
-
-    /* DtoH f */
-    ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * DIM, adat->f,
-                       adat_begin * DIM * sizeof(nbatom->out[0].f[0]),
-                       adat_len * DIM * sizeof(nbatom->out[0].f[0]), stream,
-                       bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
-    /* kick off work */
-    cl_error = clFlush(stream);
-    GMX_ASSERT(cl_error == CL_SUCCESS, ("clFlush failed: " + ocl_get_error_string(cl_error)).c_str());
-
-    /* After the non-local D2H is launched the nonlocal_done event can be
-       recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we first need the non-local
-       data back first. */
-    if (iloc == InteractionLocality::NonLocal)
-    {
-        cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
-        GMX_ASSERT(cl_error == CL_SUCCESS,
-                   ("clEnqueueMarkerWithWaitList failed: " + ocl_get_error_string(cl_error)).c_str());
-        nb->bNonLocalStreamActive = CL_TRUE;
-    }
-
-    /* only transfer energies in the local stream */
-    if (iloc == InteractionLocality::Local)
-    {
-        /* DtoH fshift when virial is needed */
-        if (stepWork.computeVirial)
-        {
-            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0, SHIFTS * sizeof(nb->nbst.fshift[0]),
-                               stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-        }
-
-        /* DtoH energies */
-        if (stepWork.computeEnergy)
-        {
-            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0, sizeof(float), stream,
-                               bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
-            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0, sizeof(float), stream,
-                               bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-        }
-    }
-
-    if (bDoTime)
-    {
-        t->xf[aloc].nb_d2h.closeTimingRegion(stream);
-    }
-}
-
-
-/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
-int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic)
-{
-    bool bTwinCut = (ic.rcoulomb != ic.rvdw);
-    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
-    int  kernel_type;
-
-    /* Benchmarking/development environment variables to force the use of
-       analytical or tabulated Ewald kernel. */
-    bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
-    bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
-
-    if (bForceAnalyticalEwald && bForceTabulatedEwald)
-    {
-        gmx_incons(
-                "Both analytical and tabulated Ewald OpenCL non-bonded kernels "
-                "requested through environment variables.");
-    }
-
-    /* OpenCL: By default, use analytical Ewald
-     * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
-     *
-     */
-    /* By default use analytical Ewald. */
-    bUseAnalyticalEwald = true;
-    if (bForceAnalyticalEwald)
-    {
-        if (debug)
-        {
-            fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
-        }
-    }
-    else if (bForceTabulatedEwald)
-    {
-        bUseAnalyticalEwald = false;
-
-        if (debug)
-        {
-            fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
-        }
-    }
-
-    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
-       forces it (use it for debugging/benchmarking only). */
-    if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == nullptr))
-    {
-        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
-    }
-    else
-    {
-        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
-    }
-
-    return kernel_type;
-}
-
 } // namespace Nbnxm