Fix random typos

[alexxy/gromacs.git] / src / gromacs / nbnxm / opencl / nbnxm_ocl.cpp
diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp

index 2ce49b2eb995736ec74de1d52799fd071af94d36..b4b28c06526441dbc7f2b5e162a4a1d56bd4bb13 100644 (file)
--- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
+++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp
@@ -1,7 +1,8 @@
  /*
   * This file is part of the GROMACS molecular simulation package.
   *
- * Copyright (c) 2012,2013,2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
+ * Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
   * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   * and including many others, as listed in the AUTHORS file in the
   * top-level source directory and at http://www.gromacs.org.
@@ -64,13 +65,13 @@
  #include <stdlib.h>
  
  #if defined(_MSVC)
-#include <limits>
+#    include <limits>
  #endif
  
-#include "thread_mpi/atomic.h"
-
+#include "gromacs/gpu_utils/device_context.h"
  #include "gromacs/gpu_utils/gputraits_ocl.h"
  #include "gromacs/gpu_utils/oclutils.h"
+#include "gromacs/hardware/device_information.h"
  #include "gromacs/hardware/hw_info.h"
  #include "gromacs/mdtypes/simulation_workload.h"
  #include "gromacs/nbnxm/atomdata.h"
@@ -80,13 +81,11 @@
  #include "gromacs/nbnxm/nbnxm.h"
  #include "gromacs/nbnxm/nbnxm_gpu.h"
  #include "gromacs/nbnxm/pairlist.h"
-#include "gromacs/pbcutil/ishift.h"
  #include "gromacs/timing/gpu_timing.h"
  #include "gromacs/utility/cstringutil.h"
  #include "gromacs/utility/fatalerror.h"
  #include "gromacs/utility/gmxassert.h"
  
-#include "nbnxm_ocl_internal.h"
  #include "nbnxm_ocl_types.h"
  
  namespace Nbnxm
@@ -94,19 +93,20 @@ namespace Nbnxm
  
  /*! \brief Convenience constants */
  //@{
-static const int c_numClPerSupercl = c_nbnxnGpuNumClusterPerSupercluster;
-static const int c_clSize          = c_nbnxnGpuClusterSize;
+static constexpr int c_clSize = c_nbnxnGpuClusterSize;
  //@}
  
  
  /*! \brief Validates the input global work size parameter.
   */
-static inline void validate_global_work_size(const KernelLaunchConfig &config, int work_dim, const gmx_device_info_t *dinfo)
+static inline void validate_global_work_size(const KernelLaunchConfig& config,
+                                             int                       work_dim,
+                                             const DeviceInformation*  dinfo)
  {
      cl_uint device_size_t_size_bits;
      cl_uint host_size_t_size_bits;
  
-    assert(dinfo);
+    GMX_ASSERT(dinfo, "Need a valid device info object");
  
      size_t global_work_size[3];
      GMX_ASSERT(work_dim <= 3, "Not supporting hyper-grids just yet");
@@ -139,10 +139,13 @@ static inline void validate_global_work_size(const KernelLaunchConfig &config, i
          {
              if (global_work_size[i] > device_limit)
              {
-                gmx_fatal(FARGS, "Watch out, the input system is too large to simulate!\n"
-                          "The number of nonbonded work units (=number of super-clusters) exceeds the"
-                          "device capabilities. Global work size limit exceeded (%zu > %zu)!",
-                          global_work_size[i], device_limit);
+                gmx_fatal(
+                        FARGS,
+                        "Watch out, the input system is too large to simulate!\n"
+                        "The number of nonbonded work units (=number of super-clusters) exceeds "
+                        "the device capabilities. Global work size limit exceeded (%zu > %zu)!",
+                        global_work_size[i],
+                        device_limit);
              }
          }
      }
@@ -157,47 +160,187 @@ static inline void validate_global_work_size(const KernelLaunchConfig &config, i
   */
  
  /*! \brief Force-only kernel function names. */
-static const char* nb_kfunc_noener_noprune_ptr[eelOclNR][evdwOclNR] =
-{
-    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl"            },
-    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl"             },
-    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl"        },
-    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
-    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl"             },
-    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl"      }
+static const char* nb_kfunc_noener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
+    { "nbnxn_kernel_ElecCut_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_opencl" },
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_opencl" },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_opencl" },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_opencl" },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_opencl" }
  };
  
  /*! \brief Force + energy kernel function pointers. */
-static const char* nb_kfunc_ener_noprune_ptr[eelOclNR][evdwOclNR] =
-{
-    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl"            },
-    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl"             },
-    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl"        },
-    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
-    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl"             },
-    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl"      }
+static const char* nb_kfunc_ener_noprune_ptr[c_numElecTypes][c_numVdwTypes] = {
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_opencl" },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_opencl" }
  };
  
  /*! \brief Force + pruning kernel function pointers. */
-static const char* nb_kfunc_noener_prune_ptr[eelOclNR][evdwOclNR] =
-{
-    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl"             },
-    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl"              },
-    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl"         },
-    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl"  },
-    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl"              },
-    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl"       }
+static const char* nb_kfunc_noener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
+    { "nbnxn_kernel_ElecCut_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombLB_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecRF_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombLB_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombLB_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_F_prune_opencl" },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_F_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_F_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_F_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_F_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_F_prune_opencl" }
  };
  
  /*! \brief Force + energy + pruning kernel function pointers. */
-static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
-{
-    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",            "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl"            },
-    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl"             },
-    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",        "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl"        },
-    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl", "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
-    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",             "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl"             },
-    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl"      }
+static const char* nb_kfunc_ener_prune_ptr[c_numElecTypes][c_numVdwTypes] = {
+    { "nbnxn_kernel_ElecCut_VdwLJ_VF_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJCombLB_VF_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecCut_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecRF_VdwLJ_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJCombLB_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecRF_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEwQSTab_VdwLJ_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJCombLB_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTab_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJ_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJCombLB_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwQSTabTwinCut_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEw_VdwLJ_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJCombLB_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecEw_VdwLJEwCombLB_VF_prune_opencl" },
+    { "nbnxn_kernel_ElecEwTwinCut_VdwLJ_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJCombLB_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJFsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJPsw_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombGeom_VF_prune_opencl",
+      "nbnxn_kernel_ElecEwTwinCut_VdwLJEwCombLB_VF_prune_opencl" }
  };
  
  /*! \brief Return a pointer to the prune kernel version to be executed at the current invocation.
@@ -205,10 +348,9 @@ static const char* nb_kfunc_ener_prune_ptr[eelOclNR][evdwOclNR] =
   * \param[in] kernel_pruneonly  array of prune kernel objects
   * \param[in] firstPrunePass    true if the first pruning pass is being executed
   */
-static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[],
-                                          bool      firstPrunePass)
+static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[], bool firstPrunePass)
  {
-    cl_kernel  *kernelPtr;
+    cl_kernel* kernelPtr;
  
      if (firstPrunePass)
      {
@@ -227,88 +369,92 @@ static inline cl_kernel selectPruneKernel(cl_kernel kernel_pruneonly[],
   *  OpenCL kernel objects are cached in nb. If the requested kernel is not
   *  found in the cache, it will be created and the cache will be updated.
   */
-static inline cl_kernel select_nbnxn_kernel(gmx_nbnxn_ocl_t   *nb,
-                                            int                eeltype,
-                                            int                evdwtype,
-                                            bool               bDoEne,
-                                            bool               bDoPrune)
+static inline cl_kernel
+select_nbnxn_kernel(NbnxmGpu* nb, enum ElecType elecType, enum VdwType vdwType, bool bDoEne, bool bDoPrune)
  {
      const char* kernel_name_to_run;
-    cl_kernel  *kernel_ptr;
+    cl_kernel*  kernel_ptr;
      cl_int      cl_error;
  
-    assert(eeltype  < eelOclNR);
-    assert(evdwtype < evdwOclNR);
+    const int elecTypeIdx = static_cast<int>(elecType);
+    const int vdwTypeIdx  = static_cast<int>(vdwType);
+
+    GMX_ASSERT(elecTypeIdx < c_numElecTypes,
+               "The electrostatics type requested is not implemented in the OpenCL kernels.");
+    GMX_ASSERT(vdwTypeIdx < c_numVdwTypes,
+               "The VdW type requested is not implemented in the OpenCL kernels.");
  
      if (bDoEne)
      {
          if (bDoPrune)
          {
-            kernel_name_to_run = nb_kfunc_ener_prune_ptr[eeltype][evdwtype];
-            kernel_ptr         = &(nb->kernel_ener_prune_ptr[eeltype][evdwtype]);
+            kernel_name_to_run = nb_kfunc_ener_prune_ptr[elecTypeIdx][vdwTypeIdx];
+            kernel_ptr         = &(nb->kernel_ener_prune_ptr[elecTypeIdx][vdwTypeIdx]);
          }
          else
          {
-            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[eeltype][evdwtype];
-            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[eeltype][evdwtype]);
+            kernel_name_to_run = nb_kfunc_ener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
+            kernel_ptr         = &(nb->kernel_ener_noprune_ptr[elecTypeIdx][vdwTypeIdx]);
          }
      }
      else
      {
          if (bDoPrune)
          {
-            kernel_name_to_run = nb_kfunc_noener_prune_ptr[eeltype][evdwtype];
-            kernel_ptr         = &(nb->kernel_noener_prune_ptr[eeltype][evdwtype]);
+            kernel_name_to_run = nb_kfunc_noener_prune_ptr[elecTypeIdx][vdwTypeIdx];
+            kernel_ptr         = &(nb->kernel_noener_prune_ptr[elecTypeIdx][vdwTypeIdx]);
          }
          else
          {
-            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[eeltype][evdwtype];
-            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[eeltype][evdwtype]);
+            kernel_name_to_run = nb_kfunc_noener_noprune_ptr[elecTypeIdx][vdwTypeIdx];
+            kernel_ptr         = &(nb->kernel_noener_noprune_ptr[elecTypeIdx][vdwTypeIdx]);
          }
      }
  
      if (nullptr == kernel_ptr[0])
      {
          *kernel_ptr = clCreateKernel(nb->dev_rundata->program, kernel_name_to_run, &cl_error);
-        assert(cl_error == CL_SUCCESS);
+        GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS,
+                           ("clCreateKernel failed: " + ocl_get_error_string(cl_error)
+                            + " for kernel named " + kernel_name_to_run)
+                                   .c_str());
      }
-    // TODO: handle errors
  
      return *kernel_ptr;
  }
  
  /*! \brief Calculates the amount of shared memory required by the nonbonded kernel in use.
   */
-static inline int calc_shmem_required_nonbonded(int  vdwType,
-                                                bool bPrefetchLjParam)
+static inline int calc_shmem_required_nonbonded(enum VdwType vdwType, bool bPrefetchLjParam)
  {
      int shmem;
  
      /* size of shmem (force-buffers/xq/atom type preloading) */
      /* NOTE: with the default kernel on sm3.0 we need shmem only for pre-loading */
      /* i-atom x+q in shared memory */
-    shmem  = c_numClPerSupercl * c_clSize * sizeof(float) * 4; /* xqib */
+    shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4; /* xqib */
      /* cj in shared memory, for both warps separately
       * TODO: in the "nowarp kernels we load cj only once  so the factor 2 is not needed.
       */
-    shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int);           /* cjs  */
+    shmem += 2 * c_nbnxnGpuJgroupSize * sizeof(int); /* cjs  */
      if (bPrefetchLjParam)
      {
          if (useLjCombRule(vdwType))
          {
              /* i-atom LJ combination parameters in shared memory */
-            shmem += c_numClPerSupercl * c_clSize * 2*sizeof(float); /* atib abused for ljcp, float2 */
+            shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * 2
+                     * sizeof(float); /* atib abused for ljcp, float2 */
          }
          else
          {
              /* i-atom types in shared memory */
-            shmem += c_numClPerSupercl * c_clSize * sizeof(int); /* atib */
+            shmem += c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(int); /* atib */
          }
      }
      /* force reduction buffers in shared memory */
-    shmem += c_clSize * c_clSize * 3 * sizeof(float);    /* f_buf */
+    shmem += c_clSize * c_clSize * 3 * sizeof(float); /* f_buf */
      /* Warp vote. In fact it must be * number of warps in block.. */
-    shmem += sizeof(cl_uint) * 2;                        /* warp_any */
+    shmem += sizeof(cl_uint) * 2; /* warp_any */
      return shmem;
  }
  
@@ -320,13 +466,12 @@ static inline int calc_shmem_required_nonbonded(int  vdwType,
   *
   *  This function is called before the launch of both nbnxn and prune kernels.
   */
-static void fillin_ocl_structures(cl_nbparam_t        *nbp,
-                                  cl_nbparam_params_t *nbparams_params)
+static void fillin_ocl_structures(NBParamGpu* nbp, cl_nbparam_params_t* nbparams_params)
  {
      nbparams_params->coulomb_tab_scale = nbp->coulomb_tab_scale;
      nbparams_params->c_rf              = nbp->c_rf;
      nbparams_params->dispersion_shift  = nbp->dispersion_shift;
-    nbparams_params->eeltype           = nbp->eeltype;
+    nbparams_params->elecType          = nbp->elecType;
      nbparams_params->epsfac            = nbp->epsfac;
      nbparams_params->ewaldcoeff_lj     = nbp->ewaldcoeff_lj;
      nbparams_params->ewald_beta        = nbp->ewald_beta;
@@ -339,116 +484,10 @@ static void fillin_ocl_structures(cl_nbparam_t        *nbp,
      nbparams_params->sh_ewald          = nbp->sh_ewald;
      nbparams_params->sh_lj_ewald       = nbp->sh_lj_ewald;
      nbparams_params->two_k_rf          = nbp->two_k_rf;
-    nbparams_params->vdwtype           = nbp->vdwtype;
+    nbparams_params->vdwType           = nbp->vdwType;
      nbparams_params->vdw_switch        = nbp->vdw_switch;
  }
  
-/*! \brief Enqueues a wait for event completion.
- *
- * Then it releases the event and sets it to 0.
- * Don't use this function when more than one wait will be issued for the event.
- * Equivalent to Cuda Stream Sync. */
-static void sync_ocl_event(cl_command_queue stream, cl_event *ocl_event)
-{
-    cl_int gmx_unused cl_error;
-
-    /* Enqueue wait */
-    cl_error = clEnqueueBarrierWithWaitList(stream, 1, ocl_event, nullptr);
-    GMX_RELEASE_ASSERT(CL_SUCCESS == cl_error, ocl_get_error_string(cl_error).c_str());
-
-    /* Release event and reset it to 0. It is ok to release it as enqueuewaitforevents performs implicit retain for events. */
-    cl_error = clReleaseEvent(*ocl_event);
-    assert(CL_SUCCESS == cl_error);
-    *ocl_event = nullptr;
-}
-
-/*! \brief Launch asynchronously the xq buffer host to device copy. */
-void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
-                        const nbnxn_atomdata_t *nbatom,
-                        const AtomLocality      atomLocality)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality);
-
-    /* local/nonlocal offset and length used for xq and f */
-    int                  adat_begin, adat_len;
-
-    cl_atomdata_t       *adat    = nb->atdat;
-    cl_plist_t          *plist   = nb->plist[iloc];
-    cl_timers_t         *t       = nb->timers;
-    cl_command_queue     stream  = nb->stream[iloc];
-
-    bool                 bDoTime = (nb->bDoTime) != 0;
-
-    /* Don't launch the non-local H2D copy if there is no dependent
-       work to do: neither non-local nor other (e.g. bonded) work
-       to do that has as input the nbnxn coordinates.
-       Doing the same for the local kernel is more complicated, since the
-       local part of the force array also depends on the non-local kernel.
-       So to avoid complicating the code and to reduce the risk of bugs,
-       we always call the local local x+q copy (and the rest of the local
-       work in nbnxn_gpu_launch_kernel().
-     */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        plist->haveFreshList = false;
-
-        return;
-    }
-
-    /* calculate the atom data index range based on locality */
-    if (atomLocality == AtomLocality::Local)
-    {
-        adat_begin  = 0;
-        adat_len    = adat->natoms_local;
-    }
-    else
-    {
-        adat_begin  = adat->natoms_local;
-        adat_len    = adat->natoms - adat->natoms_local;
-    }
-
-    /* beginning of timed HtoD section */
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.openTimingRegion(stream);
-    }
-
-    /* HtoD x, q */
-    ocl_copy_H2D_async(adat->xq, nbatom->x().data() + adat_begin * 4, adat_begin*sizeof(float)*4,
-                       adat_len * sizeof(float) * 4, stream, bDoTime ? t->xf[atomLocality].nb_h2d.fetchNextEvent() : nullptr);
-
-    if (bDoTime)
-    {
-        t->xf[atomLocality].nb_h2d.closeTimingRegion(stream);
-    }
-
-    /* When we get here all misc operations issues in the local stream as well as
-       the local xq H2D are done,
-       so we record that in the local stream and wait for it in the nonlocal one. */
-    if (nb->bUseTwoStreams)
-    {
-        if (iloc == InteractionLocality::Local)
-        {
-            cl_int gmx_used_in_debug cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->misc_ops_and_local_H2D_done));
-            assert(CL_SUCCESS == cl_error);
-
-            /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
-             * in the local stream in order to be able to sync with the above event
-             * from the non-local stream.
-             */
-            cl_error = clFlush(stream);
-            assert(CL_SUCCESS == cl_error);
-        }
-        else
-        {
-            sync_ocl_event(stream, &(nb->misc_ops_and_local_H2D_done));
-        }
-    }
-}
-
-
  /*! \brief Launch GPU kernel
  
     As we execute nonbonded workload in separate queues, before launching
@@ -467,19 +506,17 @@ void gpu_copy_xq_to_gpu(gmx_nbnxn_ocl_t        *nb,
     misc_ops_done event to record the point in time when the above  operations
     are finished and synchronize with this event in the non-local stream.
   */
-void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
-                       const gmx::StepWorkload          &stepWork,
-                       const Nbnxm::InteractionLocality  iloc)
+void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc)
  {
-    cl_atomdata_t       *adat    = nb->atdat;
-    cl_nbparam_t        *nbp     = nb->nbparam;
-    cl_plist_t          *plist   = nb->plist[iloc];
-    cl_timers_t         *t       = nb->timers;
-    cl_command_queue     stream  = nb->stream[iloc];
+    NBAtomDataGpu*      adat         = nb->atdat;
+    NBParamGpu*         nbp          = nb->nbparam;
+    gpu_plist*          plist        = nb->plist[iloc];
+    Nbnxm::GpuTimers*   timers       = nb->timers;
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
  
-    bool                 bDoTime     = (nb->bDoTime) != 0;
+    bool bDoTime = nb->bDoTime;
  
-    cl_nbparam_params_t  nbparams_params;
+    cl_nbparam_params_t nbparams_params;
  
      /* Don't launch the non-local kernel if there is no work to do.
         Doing the same for the local kernel is more complicated, since the
@@ -516,67 +553,97 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
      /* beginning of timed nonbonded calculation section */
      if (bDoTime)
      {
-        t->interaction[iloc].nb_k.openTimingRegion(stream);
+        timers->interaction[iloc].nb_k.openTimingRegion(deviceStream);
      }
  
      /* kernel launch config */
  
      KernelLaunchConfig config;
-    config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwtype, nb->bPrefetchLjParam);
-    config.stream           = stream;
+    config.sharedMemorySize = calc_shmem_required_nonbonded(nbp->vdwType, nb->bPrefetchLjParam);
      config.blockSize[0]     = c_clSize;
      config.blockSize[1]     = c_clSize;
      config.gridSize[0]      = plist->nsci;
  
-    validate_global_work_size(config, 3, nb->dev_info);
+    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
  
      if (debug)
      {
-        fprintf(debug, "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+        fprintf(debug,
+                "Non-bonded GPU launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                  "Global work size : %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n",
-                config.blockSize[0], config.blockSize[1], config.blockSize[2],
-                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
-                c_numClPerSupercl, plist->na_c);
+                config.blockSize[0],
+                config.blockSize[1],
+                config.blockSize[2],
+                config.blockSize[0] * config.gridSize[0],
+                config.blockSize[1] * config.gridSize[1],
+                plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+                c_nbnxnGpuNumClusterPerSupercluster,
+                plist->na_c);
      }
  
      fillin_ocl_structures(nbp, &nbparams_params);
  
-    auto          *timingEvent  = bDoTime ? t->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
+    auto* timingEvent = bDoTime ? timers->interaction[iloc].nb_k.fetchNextEvent() : nullptr;
      constexpr char kernelName[] = "k_calc_nb";
-    const auto     kernel       = select_nbnxn_kernel(nb,
-                                                      nbp->eeltype,
-                                                      nbp->vdwtype,
-                                                      stepWork.computeEnergy,
-                                                      (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
+    const auto     kernel =
+            select_nbnxn_kernel(nb,
+                                nbp->elecType,
+                                nbp->vdwType,
+                                stepWork.computeEnergy,
+                                (plist->haveFreshList && !nb->timers->interaction[iloc].didPrune));
  
  
      // The OpenCL kernel takes int as second to last argument because bool is
      // not supported as a kernel argument type (sizeof(bool) is implementation defined).
      const int computeFshift = static_cast<int>(stepWork.computeVirial);
-    if (useLjCombRule(nb->nbparam->vdwtype))
-    {
-        const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
-                                                          &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
-                                                          &adat->lj_comb,
-                                                          &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
-                                                          &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
-
-        launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+    if (useLjCombRule(nb->nbparam->vdwType))
+    {
+        const auto kernelArgs = prepareGpuKernelArguments(kernel,
+                                                          config,
+                                                          &nbparams_params,
+                                                          &adat->xq,
+                                                          &adat->f,
+                                                          &adat->eLJ,
+                                                          &adat->eElec,
+                                                          &adat->fShift,
+                                                          &adat->ljComb,
+                                                          &adat->shiftVec,
+                                                          &nbp->nbfp,
+                                                          &nbp->nbfp_comb,
+                                                          &nbp->coulomb_tab,
+                                                          &plist->sci,
+                                                          &plist->cj4,
+                                                          &plist->excl,
+                                                          &computeFshift);
+
+        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
      }
      else
      {
-        const auto kernelArgs = prepareGpuKernelArguments(kernel, config,
-                                                          &adat->ntypes,
-                                                          &nbparams_params, &adat->xq, &adat->f, &adat->e_lj, &adat->e_el, &adat->fshift,
-                                                          &adat->atom_types,
-                                                          &adat->shift_vec, &nbp->nbfp_climg2d, &nbp->nbfp_comb_climg2d, &nbp->coulomb_tab_climg2d,
-                                                          &plist->sci, &plist->cj4, &plist->excl, &computeFshift);
-        launchGpuKernel(kernel, config, timingEvent, kernelName, kernelArgs);
+        const auto kernelArgs = prepareGpuKernelArguments(kernel,
+                                                          config,
+                                                          &adat->numTypes,
+                                                          &nbparams_params,
+                                                          &adat->xq,
+                                                          &adat->f,
+                                                          &adat->eLJ,
+                                                          &adat->eElec,
+                                                          &adat->fShift,
+                                                          &adat->atomTypes,
+                                                          &adat->shiftVec,
+                                                          &nbp->nbfp,
+                                                          &nbp->nbfp_comb,
+                                                          &nbp->coulomb_tab,
+                                                          &plist->sci,
+                                                          &plist->cj4,
+                                                          &plist->excl,
+                                                          &computeFshift);
+        launchGpuKernel(kernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
      }
  
      if (bDoTime)
      {
-        t->interaction[iloc].nb_k.closeTimingRegion(stream);
+        timers->interaction[iloc].nb_k.closeTimingRegion(deviceStream);
      }
  }
  
@@ -586,7 +653,8 @@ void gpu_launch_kernel(gmx_nbnxn_ocl_t                  *nb,
   *  Note that for the sake of simplicity we use the CUDA terminology "shared memory"
   *  for OpenCL local memory.
   *
- * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3-rd dimension.
+ * \param[in] num_threads_z cj4 concurrency equal to the number of threads/work items in the 3rd
+ * dimension.
   * \returns   the amount of local memory in bytes required by the pruning kernel
   */
  static inline int calc_shmem_required_prune(const int num_threads_z)
@@ -594,28 +662,30 @@ static inline int calc_shmem_required_prune(const int num_threads_z)
      int shmem;
  
      /* i-atom x in shared memory (for convenience we load all 4 components including q) */
-    shmem  = c_numClPerSupercl * c_clSize * sizeof(float)*4;
+    shmem = c_nbnxnGpuNumClusterPerSupercluster * c_clSize * sizeof(float) * 4;
      /* cj in shared memory, for each warp separately
       * Note: only need to load once per wavefront, but to keep the code simple,
       * for now we load twice on AMD.
       */
      shmem += num_threads_z * c_nbnxnGpuClusterpairSplit * c_nbnxnGpuJgroupSize * sizeof(int);
      /* Warp vote, requires one uint per warp/32 threads per block. */
-    shmem += sizeof(cl_uint) * 2*num_threads_z;
+    shmem += sizeof(cl_uint) * 2 * num_threads_z;
  
      return shmem;
  }
  
-void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
-                                 const InteractionLocality  iloc,
-                                 const int                  numParts)
+/*! \brief
+ * Launch the pairlist prune only kernel for the given locality.
+ * \p numParts tells in how many parts, i.e. calls, the list will be pruned.
+ */
+void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts)
  {
-    cl_atomdata_t       *adat    = nb->atdat;
-    cl_nbparam_t        *nbp     = nb->nbparam;
-    cl_plist_t          *plist   = nb->plist[iloc];
-    cl_timers_t         *t       = nb->timers;
-    cl_command_queue     stream  = nb->stream[iloc];
-    bool                 bDoTime = nb->bDoTime == CL_TRUE;
+    NBAtomDataGpu*      adat         = nb->atdat;
+    NBParamGpu*         nbp          = nb->nbparam;
+    gpu_plist*          plist        = nb->plist[iloc];
+    Nbnxm::GpuTimers*   timers       = nb->timers;
+    const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+    bool                bDoTime      = nb->bDoTime;
  
      if (plist->haveFreshList)
      {
@@ -633,7 +703,8 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
          }
          else
          {
-            GMX_ASSERT(numParts == plist->rollingPruningNumParts, "It is not allowed to change numParts in between list generation steps");
+            GMX_ASSERT(numParts == plist->rollingPruningNumParts,
+                       "It is not allowed to change numParts in between list generation steps");
          }
      }
  
@@ -649,7 +720,7 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
      }
  
      /* Compute the number of list entries to prune in this pass */
-    int numSciInPart = (plist->nsci - part)/numParts;
+    int numSciInPart = (plist->nsci - part) / numParts;
  
      /* Don't launch the kernel if there is no work to do. */
      if (numSciInPart <= 0)
@@ -659,16 +730,17 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
          return;
      }
  
-    GpuRegionTimer *timer = nullptr;
+    GpuRegionTimer* timer = nullptr;
      if (bDoTime)
      {
-        timer = &(plist->haveFreshList ? t->interaction[iloc].prune_k : t->interaction[iloc].rollingPrune_k);
+        timer = &(plist->haveFreshList ? timers->interaction[iloc].prune_k
+                                       : timers->interaction[iloc].rollingPrune_k);
      }
  
      /* beginning of timed prune calculation section */
      if (bDoTime)
      {
-        timer->openTimingRegion(stream);
+        timer->openTimingRegion(deviceStream);
      }
  
      /* Kernel launch config:
@@ -676,43 +748,55 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
       *   and j-cluster concurrency, in x, y, and z, respectively.
       * - The 1D block-grid contains as many blocks as super-clusters.
       */
-    int       num_threads_z = getOclPruneKernelJ4Concurrency(nb->dev_info->vendor_e);
-
+    int num_threads_z = c_pruneKernelJ4Concurrency;
      /* kernel launch config */
      KernelLaunchConfig config;
      config.sharedMemorySize = calc_shmem_required_prune(num_threads_z);
-    config.stream           = stream;
      config.blockSize[0]     = c_clSize;
      config.blockSize[1]     = c_clSize;
      config.blockSize[2]     = num_threads_z;
      config.gridSize[0]      = numSciInPart;
  
-    validate_global_work_size(config, 3, nb->dev_info);
+    validate_global_work_size(config, 3, &nb->deviceContext_->deviceInfo());
  
      if (debug)
      {
-        fprintf(debug, "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
+        fprintf(debug,
+                "Pruning GPU kernel launch configuration:\n\tLocal work size: %zux%zux%zu\n\t"
                  "\tGlobal work size: %zux%zu\n\t#Super-clusters/clusters: %d/%d (%d)\n"
                  "\tShMem: %zu\n",
-                config.blockSize[0], config.blockSize[1], config.blockSize[2],
-                config.blockSize[0] * config.gridSize[0], config.blockSize[1] * config.gridSize[1], plist->nsci*c_numClPerSupercl,
-                c_numClPerSupercl, plist->na_c, config.sharedMemorySize);
-    }
-
-    cl_nbparam_params_t  nbparams_params;
+                config.blockSize[0],
+                config.blockSize[1],
+                config.blockSize[2],
+                config.blockSize[0] * config.gridSize[0],
+                config.blockSize[1] * config.gridSize[1],
+                plist->nsci * c_nbnxnGpuNumClusterPerSupercluster,
+                c_nbnxnGpuNumClusterPerSupercluster,
+                plist->na_c,
+                config.sharedMemorySize);
+    }
+
+    cl_nbparam_params_t nbparams_params;
      fillin_ocl_structures(nbp, &nbparams_params);
  
-    auto          *timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
+    auto*          timingEvent  = bDoTime ? timer->fetchNextEvent() : nullptr;
      constexpr char kernelName[] = "k_pruneonly";
      const auto     pruneKernel  = selectPruneKernel(nb->kernel_pruneonly, plist->haveFreshList);
-    const auto     kernelArgs   = prepareGpuKernelArguments(pruneKernel, config,
-                                                            &nbparams_params, &adat->xq, &adat->shift_vec,
-                                                            &plist->sci, &plist->cj4, &plist->imask, &numParts, &part);
-    launchGpuKernel(pruneKernel, config, timingEvent, kernelName, kernelArgs);
+    const auto     kernelArgs   = prepareGpuKernelArguments(pruneKernel,
+                                                      config,
+                                                      &nbparams_params,
+                                                      &adat->xq,
+                                                      &adat->shiftVec,
+                                                      &plist->sci,
+                                                      &plist->cj4,
+                                                      &plist->imask,
+                                                      &numParts,
+                                                      &part);
+    launchGpuKernel(pruneKernel, config, deviceStream, timingEvent, kernelName, kernelArgs);
  
      if (plist->haveFreshList)
      {
-        plist->haveFreshList         = false;
+        plist->haveFreshList = false;
          /* Mark that pruning has been done */
          nb->timers->interaction[iloc].didPrune = true;
      }
@@ -724,165 +808,8 @@ void gpu_launch_kernel_pruneonly(gmx_nbnxn_gpu_t           *nb,
  
      if (bDoTime)
      {
-        timer->closeTimingRegion(stream);
+        timer->closeTimingRegion(deviceStream);
      }
  }
  
-/*! \brief
- * Launch asynchronously the download of nonbonded forces from the GPU
- * (and energies/shift forces if required).
- */
-void gpu_launch_cpyback(gmx_nbnxn_ocl_t                          *nb,
-                        struct nbnxn_atomdata_t                  *nbatom,
-                        const gmx::StepWorkload                  &stepWork,
-                        const AtomLocality                        aloc,
-                        const bool                     gmx_unused copyBackNbForce)
-{
-    GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
-
-    cl_int gmx_unused cl_error;
-    int               adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */
-
-    /* determine interaction locality from atom locality */
-    const InteractionLocality iloc = gpuAtomToInteractionLocality(aloc);
-
-    cl_atomdata_t            *adat    = nb->atdat;
-    cl_timers_t              *t       = nb->timers;
-    bool                      bDoTime = nb->bDoTime == CL_TRUE;
-    cl_command_queue          stream  = nb->stream[iloc];
-
-    /* don't launch non-local copy-back if there was no non-local work to do */
-    if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc))
-    {
-        /* TODO An alternative way to signal that non-local work is
-           complete is to use a clEnqueueMarker+clEnqueueBarrier
-           pair. However, the use of bNonLocalStreamActive has the
-           advantage of being local to the host, so probably minimizes
-           overhead. Curiously, for NVIDIA OpenCL with an empty-domain
-           test case, overall simulation performance was higher with
-           the API calls, but this has not been tested on AMD OpenCL,
-           so could be worth considering in future. */
-        nb->bNonLocalStreamActive = CL_FALSE;
-        return;
-    }
-
-    getGpuAtomRange(adat, aloc, &adat_begin, &adat_len);
-
-    /* beginning of timed D2H section */
-    if (bDoTime)
-    {
-        t->xf[aloc].nb_d2h.openTimingRegion(stream);
-    }
-
-    /* With DD the local D2H transfer can only start after the non-local
-       has been launched. */
-    if (iloc == InteractionLocality::Local && nb->bNonLocalStreamActive)
-    {
-        sync_ocl_event(stream, &(nb->nonlocal_done));
-    }
-
-    /* DtoH f */
-    ocl_copy_D2H_async(nbatom->out[0].f.data() + adat_begin * 3, adat->f, adat_begin*3*sizeof(float),
-                       (adat_len)* adat->f_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
-    /* kick off work */
-    cl_error = clFlush(stream);
-    assert(CL_SUCCESS == cl_error);
-
-    /* After the non-local D2H is launched the nonlocal_done event can be
-       recorded which signals that the local D2H can proceed. This event is not
-       placed after the non-local kernel because we first need the non-local
-       data back first. */
-    if (iloc == InteractionLocality::NonLocal)
-    {
-        cl_error = clEnqueueMarkerWithWaitList(stream, 0, nullptr, &(nb->nonlocal_done));
-        assert(CL_SUCCESS == cl_error);
-        nb->bNonLocalStreamActive = CL_TRUE;
-    }
-
-    /* only transfer energies in the local stream */
-    if (iloc == InteractionLocality::Local)
-    {
-        /* DtoH fshift when virial is needed */
-        if (stepWork.computeVirial)
-        {
-            ocl_copy_D2H_async(nb->nbst.fshift, adat->fshift, 0,
-                               SHIFTS * adat->fshift_elem_size, stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-        }
-
-        /* DtoH energies */
-        if (stepWork.computeEnergy)
-        {
-            ocl_copy_D2H_async(nb->nbst.e_lj, adat->e_lj, 0,
-                               sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-
-            ocl_copy_D2H_async(nb->nbst.e_el, adat->e_el, 0,
-                               sizeof(float), stream, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr);
-        }
-    }
-
-    if (bDoTime)
-    {
-        t->xf[aloc].nb_d2h.closeTimingRegion(stream);
-    }
-}
-
-
-/*! \brief Selects the Ewald kernel type, analytical or tabulated, single or twin cut-off. */
-int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t &ic)
-{
-    bool bTwinCut = (ic.rcoulomb != ic.rvdw);
-    bool bUseAnalyticalEwald, bForceAnalyticalEwald, bForceTabulatedEwald;
-    int  kernel_type;
-
-    /* Benchmarking/development environment variables to force the use of
-       analytical or tabulated Ewald kernel. */
-    bForceAnalyticalEwald = (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
-    bForceTabulatedEwald  = (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
-
-    if (bForceAnalyticalEwald && bForceTabulatedEwald)
-    {
-        gmx_incons("Both analytical and tabulated Ewald OpenCL non-bonded kernels "
-                   "requested through environment variables.");
-    }
-
-    /* OpenCL: By default, use analytical Ewald
-     * TODO: tabulated does not work, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
-     *
-     * TODO: decide if dev_info parameter should be added to recognize NVIDIA CC>=3.0 devices.
-     *
-     */
-    /* By default use analytical Ewald. */
-    bUseAnalyticalEwald = true;
-    if (bForceAnalyticalEwald)
-    {
-        if (debug)
-        {
-            fprintf(debug, "Using analytical Ewald OpenCL kernels\n");
-        }
-    }
-    else if (bForceTabulatedEwald)
-    {
-        bUseAnalyticalEwald = false;
-
-        if (debug)
-        {
-            fprintf(debug, "Using tabulated Ewald OpenCL kernels\n");
-        }
-    }
-
-    /* Use twin cut-off kernels if requested by bTwinCut or the env. var.
-       forces it (use it for debugging/benchmarking only). */
-    if (!bTwinCut && (getenv("GMX_OCL_NB_EWALD_TWINCUT") == nullptr))
-    {
-        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA : eelOclEWALD_TAB;
-    }
-    else
-    {
-        kernel_type = bUseAnalyticalEwald ? eelOclEWALD_ANA_TWIN : eelOclEWALD_TAB_TWIN;
-    }
-
-    return kernel_type;
-}
-
  } // namespace Nbnxm