* This file is part of the GROMACS molecular simulation package.
*
* Copyright (c) 2012,2013,2014,2015,2016 by the GROMACS development team.
- * Copyright (c) 2017,2018,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2017,2018,2019,2020,2021, by the GROMACS development team, led by
* Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
* and including many others, as listed in the AUTHORS file in the
* top-level source directory and at http://www.gromacs.org.
# include "opencl/nbnxm_ocl_types.h"
#endif
+#if GMX_GPU_SYCL
+# include "sycl/nbnxm_sycl_types.h"
+#endif
+
#include "nbnxm_gpu_data_mgmt.h"
+#include "gromacs/gpu_utils/device_stream_manager.h"
+#include "gromacs/gpu_utils/gputraits.h"
+#include "gromacs/gpu_utils/pmalloc.h"
+#include "gromacs/hardware/device_information.h"
+#include "gromacs/mdtypes/interaction_const.h"
+#include "gromacs/mdtypes/simulation_workload.h"
+#include "gromacs/nbnxm/gpu_common_utils.h"
#include "gromacs/nbnxm/gpu_data_mgmt.h"
+#include "gromacs/nbnxm/gridset.h"
+#include "gromacs/pbcutil/ishift.h"
#include "gromacs/timing/gpu_timing.h"
#include "gromacs/utility/cstringutil.h"
+#include "gromacs/utility/exceptions.h"
+#include "gromacs/utility/fatalerror.h"
#include "nbnxm_gpu.h"
#include "pairlistsets.h"
namespace Nbnxm
{
-void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
- NBParamGpu* nbp,
- const DeviceContext& deviceContext)
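+/*! \brief Flush \p deviceStream so that events marked in it can be synchronized with from other streams (no-op on non-OpenCL backends). */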
+static inline void issueClFlushInStream(const DeviceStream& deviceStream)
{
- if (!nbp->coulomb_tab)
+#if GMX_GPU_OPENCL
+    /* Based on section 5.13 of the OpenCL 1.2 specification, a flush is needed
+     * in the stream after marking an event in it, so that another stream can
+     * synchronize with that event.
+     */
+ cl_int cl_error = clFlush(deviceStream.stream());
+ if (cl_error != CL_SUCCESS)
{
- destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
+ GMX_THROW(gmx::InternalError("clFlush failed: " + ocl_get_error_string(cl_error)));
}
-
- nbp->coulomb_tab_scale = tables.scale;
- initParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(),
- tables.tableF.size(), deviceContext);
+#else
+ GMX_UNUSED_VALUE(deviceStream);
+#endif
}
-void inline printEnviromnentVariableDeprecationMessage(bool isEnvironmentVariableSet,
- const std::string& environmentVariableSuffix)
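+/*! \brief Initialize the Ewald Coulomb correction force table on the device, replacing any existing table. */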
+static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
+ NBParamGpu* nbp,
+ const DeviceContext& deviceContext)
{
- if (isEnvironmentVariableSet)
+ if (nbp->coulomb_tab)
{
- fprintf(stderr,
- "Environment variables GMX_CUDA_%s and GMX_OCL_%s are deprecated and will be\n"
- "removed in release 2022, please use GMX_GPU_%s instead.",
- environmentVariableSuffix.c_str(), environmentVariableSuffix.c_str(),
- environmentVariableSuffix.c_str());
+ destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
}
+
+ nbp->coulomb_tab_scale = tables.scale;
+ initParamLookupTable(
+ &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
}
-int nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic)
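+/*! \brief Pick the Ewald kernel flavor (analytical/tabulated, single/twin cut-off) based on the settings and environment variables. */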
+static inline ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
+ const DeviceInformation gmx_unused& deviceInfo)
{
bool bTwinCut = (ic.rcoulomb != ic.rvdw);
- int kernel_type;
/* Benchmarking/development environment variables to force the use of
analytical or tabulated Ewald kernel. */
-
- // Remove these when old environment variables are deprecated
- const bool forceAnalyticalEwaldLegacy = (getenv("GMX_CUDA_NB_ANA_EWALD") != nullptr)
- || (getenv("GMX_OCL_NB_ANA_EWALD") != nullptr);
- const bool forceTabulatedEwaldLegacy = (getenv("GMX_CUDA_NB_TAB_EWALD") != nullptr)
- || (getenv("GMX_OCL_NB_TAB_EWALD") != nullptr);
- const bool forceTwinCutoffEwaldLegacy = (getenv("GMX_CUDA_NB_EWALD_TWINCUT") != nullptr)
- || (getenv("GMX_OCL_NB_EWALD_TWINCUT") != nullptr);
-
- printEnviromnentVariableDeprecationMessage(forceAnalyticalEwaldLegacy, "NB_ANA_EWALD");
- printEnviromnentVariableDeprecationMessage(forceTabulatedEwaldLegacy, "NB_TAB_EWALD");
- printEnviromnentVariableDeprecationMessage(forceTwinCutoffEwaldLegacy, "NB_EWALD_TWINCUT");
-
- const bool forceAnalyticalEwald =
- (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr) || forceAnalyticalEwaldLegacy;
- const bool forceTabulatedEwald =
- (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr) || forceTabulatedEwaldLegacy;
+ const bool forceAnalyticalEwald = (getenv("GMX_GPU_NB_ANA_EWALD") != nullptr);
+ const bool forceTabulatedEwald = (getenv("GMX_GPU_NB_TAB_EWALD") != nullptr);
+ const bool forceTwinCutoffEwald = (getenv("GMX_GPU_NB_EWALD_TWINCUT") != nullptr);
if (forceAnalyticalEwald && forceTabulatedEwald)
{
"requested through environment variables.");
}
- /* By default, use analytical Ewald
- * TODO: tabulated does not work in OpenCL, it needs fixing, see init_nbparam() in nbnxn_ocl_data_mgmt.cpp
- *
+ /* By default, use analytical Ewald except with CUDA on NVIDIA CC 7.0 and 8.0.
*/
- bool bUseAnalyticalEwald = true;
+ const bool c_useTabulatedEwaldDefault =
+#if GMX_GPU_CUDA
+ (deviceInfo.prop.major == 7 && deviceInfo.prop.minor == 0)
+ || (deviceInfo.prop.major == 8 && deviceInfo.prop.minor == 0);
+#else
+ false;
+#endif
+ bool bUseAnalyticalEwald = !c_useTabulatedEwaldDefault;
if (forceAnalyticalEwald)
{
+ bUseAnalyticalEwald = true;
if (debug)
{
fprintf(debug, "Using analytical Ewald GPU kernels\n");
/* Use twin cut-off kernels if requested by bTwinCut or the env. var.
forces it (use it for debugging/benchmarking only). */
- if (!bTwinCut && ((getenv("GMX_GPU_NB_EWALD_TWINCUT") == nullptr) || forceTwinCutoffEwaldLegacy))
+ if (!bTwinCut && !forceTwinCutoffEwald)
{
- kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA : eelTypeEWALD_TAB;
+ return bUseAnalyticalEwald ? ElecType::EwaldAna : ElecType::EwaldTab;
}
else
{
- kernel_type = bUseAnalyticalEwald ? eelTypeEWALD_ANA_TWIN : eelTypeEWALD_TAB_TWIN;
+ return bUseAnalyticalEwald ? ElecType::EwaldAnaTwin : ElecType::EwaldTabTwin;
}
-
- return kernel_type;
}
-void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
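+/*! \brief Copy the cut-off and interaction parameters from \p ic and \p listParams to \p nbp. */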
+static inline void set_cutoff_parameters(NBParamGpu* nbp,
+ const interaction_const_t& ic,
+ const PairlistParams& listParams)
{
- nbp->ewald_beta = ic->ewaldcoeff_q;
- nbp->sh_ewald = ic->sh_ewald;
- nbp->epsfac = ic->epsfac;
- nbp->two_k_rf = 2.0 * ic->k_rf;
- nbp->c_rf = ic->c_rf;
- nbp->rvdw_sq = ic->rvdw * ic->rvdw;
- nbp->rcoulomb_sq = ic->rcoulomb * ic->rcoulomb;
+ nbp->ewald_beta = ic.ewaldcoeff_q;
+ nbp->sh_ewald = ic.sh_ewald;
+ nbp->epsfac = ic.epsfac;
+ nbp->two_k_rf = 2.0 * ic.reactionFieldCoefficient;
+ nbp->c_rf = ic.reactionFieldShift;
+ nbp->rvdw_sq = ic.rvdw * ic.rvdw;
+ nbp->rcoulomb_sq = ic.rcoulomb * ic.rcoulomb;
nbp->rlistOuter_sq = listParams.rlistOuter * listParams.rlistOuter;
nbp->rlistInner_sq = listParams.rlistInner * listParams.rlistInner;
nbp->useDynamicPruning = listParams.useDynamicPruning;
- nbp->sh_lj_ewald = ic->sh_lj_ewald;
- nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
+ nbp->sh_lj_ewald = ic.sh_lj_ewald;
+ nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;
- nbp->rvdw_switch = ic->rvdw_switch;
- nbp->dispersion_shift = ic->dispersion_shift;
- nbp->repulsion_shift = ic->repulsion_shift;
- nbp->vdw_switch = ic->vdw_switch;
+ nbp->rvdw_switch = ic.rvdw_switch;
+ nbp->dispersion_shift = ic.dispersion_shift;
+ nbp->repulsion_shift = ic.repulsion_shift;
+ nbp->vdw_switch = ic.vdw_switch;
}
-void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
-{
- if (!nbv || !nbv->useGpu())
- {
- return;
- }
- NbnxmGpu* nb = nbv->gpu_nbv;
- NBParamGpu* nbp = nb->nbparam;
-
- set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
-
- nbp->eeltype = nbnxn_gpu_pick_ewald_kernel_type(*ic);
-
- GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
- init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
-}
-
-void init_plist(gpu_plist* pl)
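+/*! \brief Initialize a pair list to an empty, unallocated state. */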
+static inline void init_plist(gpu_plist* pl)
{
/* initialize to nullptr pointers to data that is not allocated here and will
need reallocation in nbnxn_gpu_init_pairlist */
pl->excl = nullptr;
/* size -1 indicates that the respective array hasn't been initialized yet */
- pl->na_c = -1;
- pl->nsci = -1;
- pl->sci_nalloc = -1;
- pl->ncj4 = -1;
- pl->cj4_nalloc = -1;
- pl->nimask = -1;
- pl->imask_nalloc = -1;
- pl->nexcl = -1;
- pl->excl_nalloc = -1;
- pl->haveFreshList = false;
+ pl->na_c = -1;
+ pl->nsci = -1;
+ pl->sci_nalloc = -1;
+ pl->ncj4 = -1;
+ pl->cj4_nalloc = -1;
+ pl->nimask = -1;
+ pl->imask_nalloc = -1;
+ pl->nexcl = -1;
+ pl->excl_nalloc = -1;
+ pl->haveFreshList = false;
+ pl->rollingPruningNumParts = 0;
+ pl->rollingPruningPart = 0;
}
-void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
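+/*! \brief Zero-initialize the GPU timing data. */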
+static inline void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
{
- int i, j;
-
t->nb_h2d_t = 0.0;
t->nb_d2h_t = 0.0;
t->nb_c = 0;
t->pl_h2d_t = 0.0;
t->pl_h2d_c = 0;
- for (i = 0; i < 2; i++)
+ for (int i = 0; i < 2; i++)
{
- for (j = 0; j < 2; j++)
+ for (int j = 0; j < 2; j++)
{
t->ktime[i][j].t = 0.0;
t->ktime[i][j].c = 0;
t->dynamicPruneTime.t = 0.0;
}
+/*! \brief Initialize \p atomdata for the first time; it only gets filled at pair-search. */
+static inline void initAtomdataFirst(NBAtomDataGpu* atomdata,
+ int numTypes,
+ const DeviceContext& deviceContext,
+ const DeviceStream& localStream)
+{
+ atomdata->numTypes = numTypes;
+ allocateDeviceBuffer(&atomdata->shiftVec, gmx::c_numShiftVectors, deviceContext);
+ atomdata->shiftVecUploaded = false;
+
+ allocateDeviceBuffer(&atomdata->fShift, gmx::c_numShiftVectors, deviceContext);
+ allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
+ allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
+
+ clearDeviceBufferAsync(&atomdata->fShift, 0, gmx::c_numShiftVectors, localStream);
+ clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
+ clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
+
+    /* initialize pointers to data that is not allocated here to nullptr; it will
+       need reallocation later */
+ atomdata->xq = nullptr;
+ atomdata->f = nullptr;
+
+ /* size -1 indicates that the respective array hasn't been initialized yet */
+ atomdata->numAtoms = -1;
+ atomdata->numAtomsAlloc = -1;
+}
+
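+/*! \brief Pick the VdW kernel flavor matching the requested VdW type, interaction modifier, and combination rule. */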
+static inline VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic,
+ LJCombinationRule ljCombinationRule)
+{
+ if (ic.vdwtype == VanDerWaalsType::Cut)
+ {
+ switch (ic.vdw_modifier)
+ {
+ case InteractionModifiers::None:
+ case InteractionModifiers::PotShift:
+ switch (ljCombinationRule)
+ {
+ case LJCombinationRule::None: return VdwType::Cut;
+ case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
+ case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
+ default:
+ GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
+ "The requested LJ combination rule %s is not implemented in "
+ "the GPU accelerated kernels!",
+ enumValueToString(ljCombinationRule))));
+ }
+ case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
+ case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
+ default:
+ GMX_THROW(gmx::InconsistentInputError(
+ gmx::formatString("The requested VdW interaction modifier %s is not "
+ "implemented in the GPU accelerated kernels!",
+ enumValueToString(ic.vdw_modifier))));
+ }
+ }
+ else if (ic.vdwtype == VanDerWaalsType::Pme)
+ {
+ if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
+ {
+ GMX_RELEASE_ASSERT(
+ ljCombinationRule == LJCombinationRule::Geometric,
+ "Combination rules for long- and short-range interactions should match.");
+ return VdwType::EwaldGeom;
+ }
+ else
+ {
+ GMX_RELEASE_ASSERT(
+ ljCombinationRule == LJCombinationRule::LorentzBerthelot,
+ "Combination rules for long- and short-range interactions should match.");
+ return VdwType::EwaldLB;
+ }
+ }
+ else
+ {
+ GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
+ "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
+ enumValueToString(ic.vdwtype))));
+ }
+}
+
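+/*! \brief Pick the electrostatics kernel flavor matching the requested Coulomb interaction type. */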
+static inline ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
+ const DeviceInformation& deviceInfo)
+{
+ if (ic.eeltype == CoulombInteractionType::Cut)
+ {
+ return ElecType::Cut;
+ }
+ else if (EEL_RF(ic.eeltype))
+ {
+ return ElecType::RF;
+ }
+ else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
+ {
+ return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
+ }
+ else
+ {
+ /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+ GMX_THROW(gmx::InconsistentInputError(
+ gmx::formatString("The requested electrostatics type %s is not implemented in "
+ "the GPU accelerated kernels!",
+ enumValueToString(ic.eeltype))));
+ }
+}
+
+/*! \brief Initialize the nonbonded parameter data structure. */
+static inline void initNbparam(NBParamGpu* nbp,
+ const interaction_const_t& ic,
+ const PairlistParams& listParams,
+ const nbnxn_atomdata_t::Params& nbatParams,
+ const DeviceContext& deviceContext)
+{
+ const int numTypes = nbatParams.numTypes;
+
+ set_cutoff_parameters(nbp, ic, listParams);
+
+ nbp->vdwType = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
+ nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
+
+ if (ic.vdwtype == VanDerWaalsType::Pme)
+ {
+ if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
+ {
+ GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
+ "Combination rule mismatch!");
+ }
+ else
+ {
+ GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
+ "Combination rule mismatch!");
+ }
+ }
+
+ /* generate table for PME */
+ if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
+ {
+ GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
+ }
+
+ /* set up LJ parameter lookup table */
+ if (!useLjCombRule(nbp->vdwType))
+ {
+ static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+ "Mismatch in the size of host / device data types");
+ initParamLookupTable(&nbp->nbfp,
+ &nbp->nbfp_texobj,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+ numTypes * numTypes,
+ deviceContext);
+ }
+
+ /* set up LJ-PME parameter lookup table */
+ if (ic.vdwtype == VanDerWaalsType::Pme)
+ {
+ static_assert(sizeof(decltype(nbp->nbfp_comb))
+ == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+ "Mismatch in the size of host / device data types");
+ initParamLookupTable(&nbp->nbfp_comb,
+ &nbp->nbfp_comb_texobj,
+ reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+ numTypes,
+ deviceContext);
+ }
+}
+
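+/*! \brief Initialize the NBNXM GPU data structures and parameters; returns the new NbnxmGpu object. */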
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+ const interaction_const_t* ic,
+ const PairlistParams& listParams,
+ const nbnxn_atomdata_t* nbat,
+ const bool bLocalAndNonlocal)
+{
+ auto* nb = new NbnxmGpu();
+ nb->deviceContext_ = &deviceStreamManager.context();
+ nb->atdat = new NBAtomDataGpu;
+ nb->nbparam = new NBParamGpu;
+ nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
+ if (bLocalAndNonlocal)
+ {
+ nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
+ }
+
+ nb->bUseTwoStreams = bLocalAndNonlocal;
+
+ nb->timers = new Nbnxm::GpuTimers();
+ snew(nb->timings, 1);
+
+ nb->bDoTime = decideGpuTimingsUsage();
+
+ if (nb->bDoTime)
+ {
+ init_timings(nb->timings);
+ }
+
+ /* init nbst */
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
+ pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), gmx::c_numShiftVectors * sizeof(*nb->nbst.fShift));
+
+ init_plist(nb->plist[InteractionLocality::Local]);
+
+ /* local/non-local GPU streams */
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+ "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+ const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+ nb->deviceStreams[InteractionLocality::Local] = &localStream;
+ // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
+ // out-of-order. But for the time being, it will be less disruptive to keep them.
+ if (nb->bUseTwoStreams)
+ {
+ init_plist(nb->plist[InteractionLocality::NonLocal]);
+
+ GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+ "Non-local non-bonded stream should be initialized to use GPU for "
+ "non-bonded with domain decomposition.");
+ nb->deviceStreams[InteractionLocality::NonLocal] =
+ &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
+ }
+
+ const nbnxn_atomdata_t::Params& nbatParams = nbat->params();
+ const DeviceContext& deviceContext = *nb->deviceContext_;
+
+ initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
+ initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
+
+ gpu_init_platform_specific(nb);
+
+ if (debug)
+ {
+ fprintf(debug, "Initialized NBNXM GPU data structures.\n");
+ }
+
+ return nb;
+}
+
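+/*! \brief Update the GPU nonbonded parameters after PME load balancing changed the Coulomb cut-off. */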
+void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
+{
+ if (!nbv || !nbv->useGpu())
+ {
+ return;
+ }
+ NbnxmGpu* nb = nbv->gpu_nbv;
+ NBParamGpu* nbp = nb->nbparam;
+
+ set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
+
+ nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
+
+ GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+ init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
+}
+
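+/*! \brief Upload the shift vectors to the device if the box is dynamic or they have not been uploaded yet. */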
+void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
+{
+ NBAtomDataGpu* adat = nb->atdat;
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+
+    /* only if we have a dynamic box or the shift vectors have not been uploaded yet */
+ if (nbatom->bDynamicBox || !adat->shiftVecUploaded)
+ {
+ copyToDeviceBuffer(&adat->shiftVec,
+ gmx::asGenericFloat3Pointer(nbatom->shift_vec),
+ 0,
+ gmx::c_numShiftVectors,
+ localStream,
+ GpuApiCallBehavior::Async,
+ nullptr);
+ adat->shiftVecUploaded = true;
+ }
+}
+
//! This function is documented in the header file
void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
{
{
if (d_plist->na_c != h_plist->na_ci)
{
- sprintf(sbuf, "In init_plist: the #atoms per cell has changed (from %d to %d)",
- d_plist->na_c, h_plist->na_ci);
+ sprintf(sbuf,
+ "In init_plist: the #atoms per cell has changed (from %d to %d)",
+ d_plist->na_c,
+ h_plist->na_ci);
gmx_incons(sbuf);
}
}
- gpu_timers_t::Interaction& iTimers = nb->timers->interaction[iloc];
+ GpuTimers::Interaction& iTimers = nb->timers->interaction[iloc];
if (bDoTime)
{
// TODO most of this function is same in CUDA and OpenCL, move into the header
const DeviceContext& deviceContext = *nb->deviceContext_;
- reallocateDeviceBuffer(&d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc,
- deviceContext);
- copyToDeviceBuffer(&d_plist->sci, h_plist->sci.data(), 0, h_plist->sci.size(), deviceStream,
- GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+ reallocateDeviceBuffer(
+ &d_plist->sci, h_plist->sci.size(), &d_plist->nsci, &d_plist->sci_nalloc, deviceContext);
+ copyToDeviceBuffer(&d_plist->sci,
+ h_plist->sci.data(),
+ 0,
+ h_plist->sci.size(),
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
- reallocateDeviceBuffer(&d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc,
- deviceContext);
- copyToDeviceBuffer(&d_plist->cj4, h_plist->cj4.data(), 0, h_plist->cj4.size(), deviceStream,
- GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+ reallocateDeviceBuffer(
+ &d_plist->cj4, h_plist->cj4.size(), &d_plist->ncj4, &d_plist->cj4_nalloc, deviceContext);
+ copyToDeviceBuffer(&d_plist->cj4,
+ h_plist->cj4.data(),
+ 0,
+ h_plist->cj4.size(),
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
- reallocateDeviceBuffer(&d_plist->imask, h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
- &d_plist->nimask, &d_plist->imask_nalloc, deviceContext);
+ reallocateDeviceBuffer(&d_plist->imask,
+ h_plist->cj4.size() * c_nbnxnGpuClusterpairSplit,
+ &d_plist->nimask,
+ &d_plist->imask_nalloc,
+ deviceContext);
- reallocateDeviceBuffer(&d_plist->excl, h_plist->excl.size(), &d_plist->nexcl,
- &d_plist->excl_nalloc, deviceContext);
- copyToDeviceBuffer(&d_plist->excl, h_plist->excl.data(), 0, h_plist->excl.size(), deviceStream,
- GpuApiCallBehavior::Async, bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
+ reallocateDeviceBuffer(
+ &d_plist->excl, h_plist->excl.size(), &d_plist->nexcl, &d_plist->excl_nalloc, deviceContext);
+ copyToDeviceBuffer(&d_plist->excl,
+ h_plist->excl.data(),
+ 0,
+ h_plist->excl.size(),
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? iTimers.pl_h2d.fetchNextEvent() : nullptr);
if (bDoTime)
{
d_plist->haveFreshList = true;
}
+void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
+{
+ bool bDoTime = nb->bDoTime;
+ Nbnxm::GpuTimers* timers = bDoTime ? nb->timers : nullptr;
+ NBAtomDataGpu* atdat = nb->atdat;
+ const DeviceContext& deviceContext = *nb->deviceContext_;
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+
+ int numAtoms = nbat->numAtoms();
+ bool realloced = false;
+
+ if (bDoTime)
+ {
+ /* time async copy */
+ timers->atdat.openTimingRegion(localStream);
+ }
+
+    /* need to reallocate if we have to copy more atoms than the amount of space
+       available; only allocate if we haven't initialized yet, i.e. atdat->numAtomsAlloc == -1 */
+ if (numAtoms > atdat->numAtomsAlloc)
+ {
+ int numAlloc = over_alloc_small(numAtoms);
+
+ /* free up first if the arrays have already been initialized */
+ if (atdat->numAtomsAlloc != -1)
+ {
+ freeDeviceBuffer(&atdat->f);
+ freeDeviceBuffer(&atdat->xq);
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ freeDeviceBuffer(&atdat->ljComb);
+ }
+ else
+ {
+ freeDeviceBuffer(&atdat->atomTypes);
+ }
+ }
+
+ allocateDeviceBuffer(&atdat->f, numAlloc, deviceContext);
+ allocateDeviceBuffer(&atdat->xq, numAlloc, deviceContext);
+
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ // Two Lennard-Jones parameters per atom
+ allocateDeviceBuffer(&atdat->ljComb, numAlloc, deviceContext);
+ }
+ else
+ {
+ allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext);
+ }
+
+ atdat->numAtomsAlloc = numAlloc;
+ realloced = true;
+ }
+
+ atdat->numAtoms = numAtoms;
+ atdat->numAtomsLocal = nbat->natoms_local;
+
+ /* need to clear GPU f output if realloc happened */
+ if (realloced)
+ {
+ clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream);
+ }
+
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ static_assert(
+ sizeof(Float2) == 2 * sizeof(*nbat->params().lj_comb.data()),
+ "Size of a pair of LJ parameters elements should be equal to the size of Float2.");
+ copyToDeviceBuffer(&atdat->ljComb,
+ reinterpret_cast<const Float2*>(nbat->params().lj_comb.data()),
+ 0,
+ numAtoms,
+ localStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+ }
+ else
+ {
+ static_assert(sizeof(int) == sizeof(*nbat->params().type.data()),
+ "Sizes of host- and device-side atom types should be the same.");
+ copyToDeviceBuffer(&atdat->atomTypes,
+ nbat->params().type.data(),
+ 0,
+ numAtoms,
+ localStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timers->atdat.fetchNextEvent() : nullptr);
+ }
+
+ if (bDoTime)
+ {
+ timers->atdat.closeTimingRegion(localStream);
+ }
+
+ /* kick off the tasks enqueued above to ensure concurrency with the search */
+ issueClFlushInStream(localStream);
+}
+
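+/*! \brief Clear the device force buffer, and the shift force and energy buffers when the virial is computed. */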
+void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
+{
+ NBAtomDataGpu* adat = nb->atdat;
+ const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
+ // Clear forces
+ clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
+ // Clear shift force array and energies if the outputs were used in the current step
+ if (computeVirial)
+ {
+ clearDeviceBufferAsync(&adat->fShift, 0, gmx::c_numShiftVectors, localStream);
+ clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
+ clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
+ }
+ issueClFlushInStream(localStream);
+}
+
//! This function is documented in the header file
gmx_wallclock_gpu_nbnxn_t* gpu_get_timings(NbnxmGpu* nb)
{
bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
{
- return ((nb->nbparam->eeltype == eelTypeEWALD_ANA) || (nb->nbparam->eeltype == eelTypeEWALD_ANA_TWIN));
+ return ((nb->nbparam->elecType == ElecType::EwaldAna)
+ || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
+}
+
+void setupGpuShortRangeWork(NbnxmGpu* nb,
+ const gmx::ListedForcesGpu* listedForcesGpu,
+ const gmx::InteractionLocality iLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ // There is short-range work if the pair list for the provided
+ // interaction locality contains entries or if there is any
+ // bonded work (as this is not split into local/nonlocal).
+ nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
+ || (listedForcesGpu != nullptr && listedForcesGpu->haveInteractions()));
+}
+
+bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ return nb->haveWork[interactionLocality];
+}
+
+/*! \brief
+ * Launch asynchronously the download of nonbonded forces from the GPU
+ * (and energies/shift forces if required).
+ */
+void gpu_launch_cpyback(NbnxmGpu* nb,
+ struct nbnxn_atomdata_t* nbatom,
+ const gmx::StepWorkload& stepWork,
+ const AtomLocality atomLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ /* determine interaction locality from atom locality */
+ const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
+ GMX_ASSERT(iloc == InteractionLocality::Local
+ || (iloc == InteractionLocality::NonLocal && nb->bNonLocalStreamDoneMarked == false),
+ "Non-local stream is indicating that the copy back event is enqueued at the "
+ "beginning of the copy back function.");
+
+ /* extract the data */
+ NBAtomDataGpu* adat = nb->atdat;
+ Nbnxm::GpuTimers* timers = nb->timers;
+ bool bDoTime = nb->bDoTime;
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+ /* don't launch non-local copy-back if there was no non-local work to do */
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
+ {
+ /* TODO An alternative way to signal that non-local work is
+ complete is to use a clEnqueueMarker+clEnqueueBarrier
+ pair. However, the use of bNonLocalStreamDoneMarked has the
+ advantage of being local to the host, so probably minimizes
+ overhead. Curiously, for NVIDIA OpenCL with an empty-domain
+ test case, overall simulation performance was higher with
+ the API calls, but this has not been tested on AMD OpenCL,
+ so could be worth considering in future. */
+ nb->bNonLocalStreamDoneMarked = false;
+ return;
+ }
+
+ /* local/nonlocal offset and length used for xq and f */
+ auto atomsRange = getGpuAtomRange(adat, atomLocality);
+
+ /* beginning of timed D2H section */
+ if (bDoTime)
+ {
+ timers->xf[atomLocality].nb_d2h.openTimingRegion(deviceStream);
+ }
+
+ /* With DD the local D2H transfer can only start after the non-local
+ has been launched. */
+ if (iloc == InteractionLocality::Local && nb->bNonLocalStreamDoneMarked)
+ {
+ nb->nonlocal_done.enqueueWaitEvent(deviceStream);
+ nb->bNonLocalStreamDoneMarked = false;
+ }
+
+ /* DtoH f */
+ if (!stepWork.useGpuFBufferOps)
+ {
+ static_assert(
+ sizeof(*nbatom->out[0].f.data()) == sizeof(float),
+ "The host force buffer should be in single precision to match device data size.");
+ copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
+ &adat->f,
+ atomsRange.begin(),
+ atomsRange.size(),
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+
+ issueClFlushInStream(deviceStream);
+ }
+
+    /* After the non-local D2H is launched the nonlocal_done event can be
+       recorded which signals that the local D2H can proceed. This event is not
+       placed after the non-local kernel because we need the non-local data
+       back first. */
+ if (iloc == InteractionLocality::NonLocal)
+ {
+ nb->nonlocal_done.markEvent(deviceStream);
+ nb->bNonLocalStreamDoneMarked = true;
+ }
+
+ /* only transfer energies in the local stream */
+ if (iloc == InteractionLocality::Local)
+ {
+ /* DtoH fshift when virial is needed */
+ if (stepWork.computeVirial)
+ {
+ static_assert(
+ sizeof(*nb->nbst.fShift) == sizeof(Float3),
+ "Sizes of host- and device-side shift vector elements should be the same.");
+ copyFromDeviceBuffer(nb->nbst.fShift,
+ &adat->fShift,
+ 0,
+ gmx::c_numShiftVectors,
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ }
+
+ /* DtoH energies */
+ if (stepWork.computeEnergy)
+ {
+ static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float),
+ "Sizes of host- and device-side LJ energy terms should be the same.");
+ copyFromDeviceBuffer(nb->nbst.eLJ,
+ &adat->eLJ,
+ 0,
+ 1,
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ static_assert(sizeof(*nb->nbst.eElec) == sizeof(float),
+ "Sizes of host- and device-side electrostatic energy terms should be the "
+ "same.");
+ copyFromDeviceBuffer(nb->nbst.eElec,
+ &adat->eElec,
+ 0,
+ 1,
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+ }
+ }
+
+ if (bDoTime)
+ {
+ timers->xf[atomLocality].nb_d2h.closeTimingRegion(deviceStream);
+ }
+}
+
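+/*! \brief With two streams, mark an event in the local stream (Local locality) or make the non-local stream wait on it (NonLocal locality). */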
+void nbnxnInsertNonlocalGpuDependency(NbnxmGpu* nb, const InteractionLocality interactionLocality)
+{
+ const DeviceStream& deviceStream = *nb->deviceStreams[interactionLocality];
+
+ /* When we get here all misc operations issued in the local stream as well as
+ the local xq H2D are done,
+ so we record that in the local stream and wait for it in the nonlocal one.
+ This wait needs to precede any PP tasks, bonded or nonbonded, that may
+ compute on interactions between local and nonlocal atoms.
+ */
+ if (nb->bUseTwoStreams)
+ {
+ if (interactionLocality == InteractionLocality::Local)
+ {
+ nb->misc_ops_and_local_H2D_done.markEvent(deviceStream);
+ issueClFlushInStream(deviceStream);
+ }
+ else
+ {
+ nb->misc_ops_and_local_H2D_done.enqueueWaitEvent(deviceStream);
+ }
+ }
+}
+
+/*! \brief Launch asynchronously the xq buffer host to device copy. */
+void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const AtomLocality atomLocality)
+{
+ GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
+
+ const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
+
+ NBAtomDataGpu* adat = nb->atdat;
+ gpu_plist* plist = nb->plist[iloc];
+ Nbnxm::GpuTimers* timers = nb->timers;
+ const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
+
+ const bool bDoTime = nb->bDoTime;
+
+    /* Don't launch the non-local H2D copy if there is no dependent
+       work to do: neither non-local nor other (e.g. bonded) work
+       that has the nbnxn coordinates as input.
+       Doing the same for the local kernel is more complicated, since the
+       local part of the force array also depends on the non-local kernel.
+       So to avoid complicating the code and to reduce the risk of bugs,
+       we always call the local x+q copy (and the rest of the local
+       work in nbnxn_gpu_launch_kernel()).
+     */
+ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(nb, iloc))
+ {
+ plist->haveFreshList = false;
+
+ // The event is marked for Local interactions unconditionally,
+ // so it has to be released here because of the early return
+ // for NonLocal interactions.
+ nb->misc_ops_and_local_H2D_done.reset();
+
+ return;
+ }
+
+ /* local/nonlocal offset and length used for xq and f */
+ const auto atomsRange = getGpuAtomRange(adat, atomLocality);
+
+ /* beginning of timed HtoD section */
+ if (bDoTime)
+ {
+ timers->xf[atomLocality].nb_h2d.openTimingRegion(deviceStream);
+ }
+
+ /* HtoD x, q */
+ GMX_ASSERT(nbatom->XFormat == nbatXYZQ,
+ "The coordinates should be in xyzq format to copy to the Float4 device buffer.");
+ copyToDeviceBuffer(&adat->xq,
+ reinterpret_cast<const Float4*>(nbatom->x().data()) + atomsRange.begin(),
+ atomsRange.begin(),
+ atomsRange.size(),
+ deviceStream,
+ GpuApiCallBehavior::Async,
+ nullptr);
+
+ if (bDoTime)
+ {
+ timers->xf[atomLocality].nb_h2d.closeTimingRegion(deviceStream);
+ }
+
+ /* When we get here all misc operations issued in the local stream as well as
+ the local xq H2D are done,
+ so we record that in the local stream and wait for it in the nonlocal one.
+ This wait needs to precede any PP tasks, bonded or nonbonded, that may
+ compute on interactions between local and nonlocal atoms.
+ */
+ nbnxnInsertNonlocalGpuDependency(nb, iloc);
+}
+
+
+/* Initialization for X buffer operations on GPU. */
+void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv)
+{
+ const DeviceStream& localStream = *gpu_nbv->deviceStreams[InteractionLocality::Local];
+ const bool bDoTime = gpu_nbv->bDoTime;
+ const int maxNumColumns = gridSet.numColumnsMax();
+
+ reallocateDeviceBuffer(&gpu_nbv->cxy_na,
+ maxNumColumns * gridSet.grids().size(),
+ &gpu_nbv->ncxy_na,
+ &gpu_nbv->ncxy_na_alloc,
+ *gpu_nbv->deviceContext_);
+ reallocateDeviceBuffer(&gpu_nbv->cxy_ind,
+ maxNumColumns * gridSet.grids().size(),
+ &gpu_nbv->ncxy_ind,
+ &gpu_nbv->ncxy_ind_alloc,
+ *gpu_nbv->deviceContext_);
+
+ for (unsigned int g = 0; g < gridSet.grids().size(); g++)
+ {
+ const Nbnxm::Grid& grid = gridSet.grids()[g];
+
+ const int numColumns = grid.numColumns();
+ const int* atomIndices = gridSet.atomIndices().data();
+ const int atomIndicesSize = gridSet.atomIndices().size();
+ const int* cxy_na = grid.cxy_na().data();
+ const int* cxy_ind = grid.cxy_ind().data();
+
+ auto* timerH2D = bDoTime ? &gpu_nbv->timers->xf[AtomLocality::Local].nb_h2d : nullptr;
+
+ reallocateDeviceBuffer(&gpu_nbv->atomIndices,
+ atomIndicesSize,
+ &gpu_nbv->atomIndicesSize,
+ &gpu_nbv->atomIndicesSize_alloc,
+ *gpu_nbv->deviceContext_);
+
+ if (atomIndicesSize > 0)
+ {
+ if (bDoTime)
+ {
+ timerH2D->openTimingRegion(localStream);
+ }
+
+ copyToDeviceBuffer(&gpu_nbv->atomIndices,
+ atomIndices,
+ 0,
+ atomIndicesSize,
+ localStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timerH2D->fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ timerH2D->closeTimingRegion(localStream);
+ }
+ }
+
+ if (numColumns > 0)
+ {
+ if (bDoTime)
+ {
+ timerH2D->openTimingRegion(localStream);
+ }
+
+ copyToDeviceBuffer(&gpu_nbv->cxy_na,
+ cxy_na,
+ maxNumColumns * g,
+ numColumns,
+ localStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timerH2D->fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ timerH2D->closeTimingRegion(localStream);
+ }
+
+ if (bDoTime)
+ {
+ timerH2D->openTimingRegion(localStream);
+ }
+
+ copyToDeviceBuffer(&gpu_nbv->cxy_ind,
+ cxy_ind,
+ maxNumColumns * g,
+ numColumns,
+ localStream,
+ GpuApiCallBehavior::Async,
+ bDoTime ? timerH2D->fetchNextEvent() : nullptr);
+
+ if (bDoTime)
+ {
+ timerH2D->closeTimingRegion(localStream);
+ }
+ }
+ }
+
+ // The above data is transferred on the local stream but is a
+ // dependency of the nonlocal stream (specifically the nonlocal X
+ // buf ops kernel). We therefore set a dependency to ensure
+ // that the nonlocal stream waits on the local stream here.
+ // This call records an event in the local stream:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::Local);
+ // ...and this call instructs the nonlocal stream to wait on that event:
+ nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
+}
+
+//! This function is documented in the header file
+void gpu_free(NbnxmGpu* nb)
+{
+ if (nb == nullptr)
+ {
+ return;
+ }
+
+ gpu_free_platform_specific(nb);
+
+ delete nb->timers;
+ sfree(nb->timings);
+
+ NBAtomDataGpu* atdat = nb->atdat;
+ NBParamGpu* nbparam = nb->nbparam;
+
+ /* Free atdat */
+ freeDeviceBuffer(&(nb->atdat->xq));
+ freeDeviceBuffer(&(nb->atdat->f));
+ freeDeviceBuffer(&(nb->atdat->eLJ));
+ freeDeviceBuffer(&(nb->atdat->eElec));
+ freeDeviceBuffer(&(nb->atdat->fShift));
+ freeDeviceBuffer(&(nb->atdat->shiftVec));
+ if (useLjCombRule(nb->nbparam->vdwType))
+ {
+ freeDeviceBuffer(&atdat->ljComb);
+ }
+ else
+ {
+ freeDeviceBuffer(&atdat->atomTypes);
+ }
+
+ /* Free nbparam */
+ if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
+ {
+ destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
+ }
+
+ if (!useLjCombRule(nb->nbparam->vdwType))
+ {
+ destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
+ }
+
+ if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
+ {
+ destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
+ }
+
+ /* Free plist */
+ auto* plist = nb->plist[InteractionLocality::Local];
+ freeDeviceBuffer(&plist->sci);
+ freeDeviceBuffer(&plist->cj4);
+ freeDeviceBuffer(&plist->imask);
+ freeDeviceBuffer(&plist->excl);
+ delete plist;
+ if (nb->bUseTwoStreams)
+ {
+ auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
+ freeDeviceBuffer(&plist_nl->sci);
+ freeDeviceBuffer(&plist_nl->cj4);
+ freeDeviceBuffer(&plist_nl->imask);
+ freeDeviceBuffer(&plist_nl->excl);
+ delete plist_nl;
+ }
+
+ /* Free nbst */
+ pfree(nb->nbst.eLJ);
+ nb->nbst.eLJ = nullptr;
+
+ pfree(nb->nbst.eElec);
+ nb->nbst.eElec = nullptr;
+
+ pfree(nb->nbst.fShift);
+ nb->nbst.fShift = nullptr;
+
+ delete atdat;
+ delete nbparam;
+ delete nb;
+
+ if (debug)
+ {
+ fprintf(debug, "Cleaned up NBNXM GPU data structures.\n");
+ }
+}
+
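+//! Returns the device buffer holding the nonbonded force output.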
+DeviceBuffer<gmx::RVec> gpu_get_f(NbnxmGpu* nb)
+{
+ GMX_ASSERT(nb != nullptr, "nb pointer must be valid");
+
+ return nb->atdat->f;
}
} // namespace Nbnxm