Unify handling of GMX_ENABLE_GPU_TIMING and GMX_DISABLE_GPU_TIMING
[alexxy/gromacs.git] src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
index afd969ee7f8d7c34cd1c52211993cd7ce3bb5106..2263593cb842d5e43f302c83f4584a371ad908f7 100644
@@ -84,7 +84,7 @@
 namespace Nbnxm
 {
 
-inline void issueClFlushInStream(const DeviceStream& deviceStream)
+static inline void issueClFlushInStream(const DeviceStream& deviceStream)
 {
 #if GMX_GPU_OPENCL
     /* Based on the v1.2 section 5.13 of the OpenCL spec, a flush is needed
@@ -101,13 +101,13 @@ inline void issueClFlushInStream(const DeviceStream& deviceStream)
 #endif
 }
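
The flush call itself sits in the lines elided between these hunks. As a standalone illustration of the spec requirement cited in the comment above — a hypothetical helper, not the GROMACS one:

```cpp
#include <CL/cl.h>
#include <cstdio>

// Flush an OpenCL command queue so that enqueued commands are actually
// submitted to the device. clFlush only guarantees submission; clFinish
// would additionally block until all enqueued commands have completed.
static void flushCommandQueue(cl_command_queue queue)
{
    const cl_int status = clFlush(queue);
    if (status != CL_SUCCESS)
    {
        std::fprintf(stderr, "clFlush failed with OpenCL error %d\n", status);
    }
}
```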
 
-void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
-                                    NBParamGpu*                  nbp,
-                                    const DeviceContext&         deviceContext)
+static inline void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
+                                                  NBParamGpu*                  nbp,
+                                                  const DeviceContext&         deviceContext)
 {
     if (nbp->coulomb_tab)
     {
-        destroyParamLookupTable(&nbp->coulomb_tab, nbp->coulomb_tab_texobj);
+        destroyParamLookupTable(&nbp->coulomb_tab, &nbp->coulomb_tab_texobj);
     }
 
     nbp->coulomb_tab_scale = tables.scale;
@@ -115,8 +115,8 @@ void init_ewald_coulomb_force_table(const EwaldCorrectionTables& tables,
             &nbp->coulomb_tab, &nbp->coulomb_tab_texobj, tables.tableF.data(), tables.tableF.size(), deviceContext);
 }
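
This function is re-invoked whenever PME load balancing changes the Coulomb parameters (see gpu_pme_loadbal_update_param(), relocated later in this diff), so it first tears down any previously built device table; the texture object is now passed by address, presumably so the destroy helper can reset it. A minimal CUDA sketch of that rebuild-in-place pattern, with illustrative names rather than the initParamLookupTable/destroyParamLookupTable API:

```cpp
#include <cuda_runtime.h>
#include <vector>

// Free the table built for the previous cutoff (if any), then allocate
// and upload the new host-side data.
static float* rebuildDeviceTable(float* d_table, const std::vector<float>& hostTable)
{
    if (d_table != nullptr)
    {
        cudaFree(d_table);
    }
    float* d_new = nullptr;
    cudaMalloc(reinterpret_cast<void**>(&d_new), hostTable.size() * sizeof(float));
    cudaMemcpy(d_new, hostTable.data(), hostTable.size() * sizeof(float), cudaMemcpyHostToDevice);
    return d_new;
}
```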
 
-enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
-                                               const DeviceInformation gmx_unused& deviceInfo)
+static inline ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
+                                                        const DeviceInformation gmx_unused& deviceInfo)
 {
     bool bTwinCut = (ic.rcoulomb != ic.rvdw);
 
@@ -173,7 +173,9 @@ enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
     }
 }
 
-void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
+static inline void set_cutoff_parameters(NBParamGpu*                nbp,
+                                         const interaction_const_t& ic,
+                                         const PairlistParams&      listParams)
 {
     nbp->ewald_beta        = ic.ewaldcoeff_q;
     nbp->sh_ewald          = ic.sh_ewald;
@@ -195,24 +197,7 @@ void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const
     nbp->vdw_switch       = ic.vdw_switch;
 }
 
-void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
-{
-    if (!nbv || !nbv->useGpu())
-    {
-        return;
-    }
-    NbnxmGpu*   nb  = nbv->gpu_nbv;
-    NBParamGpu* nbp = nb->nbparam;
-
-    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
-
-    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
-
-    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
-    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
-}
-
-void init_plist(gpu_plist* pl)
+static inline void init_plist(gpu_plist* pl)
 {
     /* initialize to nullptr pointers to data that is not allocated here and will
        need reallocation in nbnxn_gpu_init_pairlist */
@@ -236,7 +221,7 @@ void init_plist(gpu_plist* pl)
     pl->rollingPruningPart     = 0;
 }
 
-void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
+static inline void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
 {
     t->nb_h2d_t = 0.0;
     t->nb_d2h_t = 0.0;
@@ -258,20 +243,20 @@ void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
 }
 
 /*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
-static void initAtomdataFirst(NBAtomData*          atomdata,
-                              int                  numTypes,
-                              const DeviceContext& deviceContext,
-                              const DeviceStream&  localStream)
+static inline void initAtomdataFirst(NBAtomDataGpu*       atomdata,
+                                     int                  numTypes,
+                                     const DeviceContext& deviceContext,
+                                     const DeviceStream&  localStream)
 {
     atomdata->numTypes = numTypes;
-    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
+    allocateDeviceBuffer(&atomdata->shiftVec, gmx::c_numShiftVectors, deviceContext);
     atomdata->shiftVecUploaded = false;
 
-    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
+    allocateDeviceBuffer(&atomdata->fShift, gmx::c_numShiftVectors, deviceContext);
     allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
     allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
 
-    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
+    clearDeviceBufferAsync(&atomdata->fShift, 0, gmx::c_numShiftVectors, localStream);
     clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
     clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
 
@@ -285,12 +270,91 @@ static void initAtomdataFirst(NBAtomData*          atomdata,
     atomdata->numAtomsAlloc = -1;
 }
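
Throughout the patch the old SHIFTS macro is replaced by the namespaced constant gmx::c_numShiftVectors. The value is assumed unchanged here; the rename trades a preprocessor macro for a typed constant along these lines (the real definition lives in the pbcutil headers, and the value 45 is an assumption of this sketch):

```cpp
namespace gmx
{
// Assumed: 45 periodic shift vectors, i.e. 5 x-images times 3 y-images
// times 3 z-images of the simulation box.
static constexpr int c_numShiftVectors = 5 * 3 * 3;
} // namespace gmx
```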
 
+static inline VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic,
+                                                LJCombinationRule          ljCombinationRule)
+{
+    if (ic.vdwtype == VanDerWaalsType::Cut)
+    {
+        switch (ic.vdw_modifier)
+        {
+            case InteractionModifiers::None:
+            case InteractionModifiers::PotShift:
+                switch (ljCombinationRule)
+                {
+                    case LJCombinationRule::None: return VdwType::Cut;
+                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
+                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
+                    default:
+                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
+                                "The requested LJ combination rule %s is not implemented in "
+                                "the GPU accelerated kernels!",
+                                enumValueToString(ljCombinationRule))));
+                }
+            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
+            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
+            default:
+                GMX_THROW(gmx::InconsistentInputError(
+                        gmx::formatString("The requested VdW interaction modifier %s is not "
+                                          "implemented in the GPU accelerated kernels!",
+                                          enumValueToString(ic.vdw_modifier))));
+        }
+    }
+    else if (ic.vdwtype == VanDerWaalsType::Pme)
+    {
+        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
+        {
+            GMX_RELEASE_ASSERT(
+                    ljCombinationRule == LJCombinationRule::Geometric,
+                    "Combination rules for long- and short-range interactions should match.");
+            return VdwType::EwaldGeom;
+        }
+        else
+        {
+            GMX_RELEASE_ASSERT(
+                    ljCombinationRule == LJCombinationRule::LorentzBerthelot,
+                    "Combination rules for long- and short-range interactions should match.");
+            return VdwType::EwaldLB;
+        }
+    }
+    else
+    {
+        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
+                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
+                enumValueToString(ic.vdwtype))));
+    }
+}
+
+static inline ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
+                                                            const DeviceInformation&   deviceInfo)
+{
+    if (ic.eeltype == CoulombInteractionType::Cut)
+    {
+        return ElecType::Cut;
+    }
+    else if (EEL_RF(ic.eeltype))
+    {
+        return ElecType::RF;
+    }
+    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
+    {
+        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
+    }
+    else
+    {
+        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
+        GMX_THROW(gmx::InconsistentInputError(
+                gmx::formatString("The requested electrostatics type %s is not implemented in "
+                                  "the GPU accelerated kernels!",
+                                  enumValueToString(ic.eeltype))));
+    }
+}
+
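
The two pickers above are now defined file-locally ahead of their presumed caller, initNbparam(), which follows. A hedged sketch of the expected call sites — the ljCombinationRule member name is an assumption, the other names come from this diff:

```cpp
// Hypothetical helper collecting the two calls; in the real code these
// assignments presumably sit inside initNbparam() itself.
static void pickKernelTypes(NBParamGpu*                     nbp,
                            const interaction_const_t&      ic,
                            const nbnxn_atomdata_t::Params& nbatParams,
                            const DeviceInformation&        deviceInfo)
{
    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceInfo);
}
```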
 /*! \brief Initialize the nonbonded parameter data structure. */
-static void initNbparam(NBParamGpu*                     nbp,
-                        const interaction_const_t&      ic,
-                        const PairlistParams&           listParams,
-                        const nbnxn_atomdata_t::Params& nbatParams,
-                        const DeviceContext&            deviceContext)
+static inline void initNbparam(NBParamGpu*                     nbp,
+                               const interaction_const_t&      ic,
+                               const PairlistParams&           listParams,
+                               const nbnxn_atomdata_t::Params& nbatParams,
+                               const DeviceContext&            deviceContext)
 {
     const int numTypes = nbatParams.numTypes;
 
@@ -354,7 +418,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
 {
     auto* nb                              = new NbnxmGpu();
     nb->deviceContext_                    = &deviceStreamManager.context();
-    nb->atdat                             = new NBAtomData;
+    nb->atdat                             = new NBAtomDataGpu;
     nb->nbparam                           = new NBParamGpu;
     nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
     if (bLocalAndNonlocal)
@@ -367,18 +431,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     nb->timers = new Nbnxm::GpuTimers();
     snew(nb->timings, 1);
 
-    /* WARNING: CUDA timings are incorrect with multiple streams.
-     * This is the main reason why they are disabled by default.
-     * Can be enabled by setting GMX_ENABLE_GPU_TIMING environment variable.
-     * TODO: Consider turning on by default when we can detect nr of streams.
-     *
-     * OpenCL timing is enabled by default and can be disabled by
-     * GMX_DISABLE_GPU_TIMING environment variable.
-     *
-     * Timing is disabled in SYCL.
-     */
-    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
-                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));
+    nb->bDoTime = decideGpuTimingsUsage();
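
This is the change the commit title names: the per-backend environment-variable logic and its explanatory comment move out of gpu_init() into a shared helper. The helper's definition is not part of this file; the body below is reconstructed from the deleted comment and expression and should be read as an assumption (the GMX_GPU_* build macros are taken as given):

```cpp
#include <cstdlib>

// CUDA timings are incorrect with multiple streams, so they are opt-in
// via GMX_ENABLE_GPU_TIMING; OpenCL timing is on by default and opt-out
// via GMX_DISABLE_GPU_TIMING; SYCL timing remains disabled.
static bool decideGpuTimingsUsage()
{
    if (GMX_GPU_CUDA)
    {
        return std::getenv("GMX_ENABLE_GPU_TIMING") != nullptr;
    }
    if (GMX_GPU_OPENCL)
    {
        return std::getenv("GMX_DISABLE_GPU_TIMING") == nullptr;
    }
    return false;
}
```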
 
     if (nb->bDoTime)
     {
@@ -388,7 +441,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     /* init nbst */
     pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
     pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
-    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), gmx::c_numShiftVectors * sizeof(*nb->nbst.fShift));
 
     init_plist(nb->plist[InteractionLocality::Local]);
 
@@ -426,9 +479,26 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
     return nb;
 }
 
+void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
+{
+    if (!nbv || !nbv->useGpu())
+    {
+        return;
+    }
+    NbnxmGpu*   nb  = nbv->gpu_nbv;
+    NBParamGpu* nbp = nb->nbparam;
+
+    set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
+
+    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
+
+    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
+}
+
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
 {
-    NBAtomData*         adat        = nb->atdat;
+    NBAtomDataGpu*      adat        = nb->atdat;
     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
 
     /* only if we have a dynamic box */
@@ -437,7 +507,7 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
         copyToDeviceBuffer(&adat->shiftVec,
                            gmx::asGenericFloat3Pointer(nbatom->shift_vec),
                            0,
-                           SHIFTS,
+                           gmx::c_numShiftVectors,
                            localStream,
                            GpuApiCallBehavior::Async,
                            nullptr);
@@ -532,7 +602,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 {
     bool                 bDoTime       = nb->bDoTime;
     Nbnxm::GpuTimers*    timers        = bDoTime ? nb->timers : nullptr;
-    NBAtomData*          atdat         = nb->atdat;
+    NBAtomDataGpu*       atdat         = nb->atdat;
     const DeviceContext& deviceContext = *nb->deviceContext_;
     const DeviceStream&  localStream   = *nb->deviceStreams[InteractionLocality::Local];
 
@@ -630,14 +700,14 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
 
 void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial)
 {
-    NBAtomData*         adat        = nb->atdat;
+    NBAtomDataGpu*      adat        = nb->atdat;
     const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local];
     // Clear forces
     clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream);
     // Clear shift force array and energies if the outputs were used in the current step
     if (computeVirial)
     {
-        clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream);
+        clearDeviceBufferAsync(&adat->fShift, 0, gmx::c_numShiftVectors, localStream);
         clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream);
         clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream);
     }
@@ -665,82 +735,9 @@ bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
 }
 
-enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
-                                                   const DeviceInformation&   deviceInfo)
-{
-    if (ic.eeltype == CoulombInteractionType::Cut)
-    {
-        return ElecType::Cut;
-    }
-    else if (EEL_RF(ic.eeltype))
-    {
-        return ElecType::RF;
-    }
-    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
-    {
-        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
-    }
-    else
-    {
-        /* Shouldn't happen, as this is checked when choosing Verlet-scheme */
-        GMX_THROW(gmx::InconsistentInputError(
-                gmx::formatString("The requested electrostatics type %s is not implemented in "
-                                  "the GPU accelerated kernels!",
-                                  enumValueToString(ic.eeltype))));
-    }
-}
-
-
-enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
-{
-    if (ic.vdwtype == VanDerWaalsType::Cut)
-    {
-        switch (ic.vdw_modifier)
-        {
-            case InteractionModifiers::None:
-            case InteractionModifiers::PotShift:
-                switch (ljCombinationRule)
-                {
-                    case LJCombinationRule::None: return VdwType::Cut;
-                    case LJCombinationRule::Geometric: return VdwType::CutCombGeom;
-                    case LJCombinationRule::LorentzBerthelot: return VdwType::CutCombLB;
-                    default:
-                        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
-                                "The requested LJ combination rule %s is not implemented in "
-                                "the GPU accelerated kernels!",
-                                enumValueToString(ljCombinationRule))));
-                }
-            case InteractionModifiers::ForceSwitch: return VdwType::FSwitch;
-            case InteractionModifiers::PotSwitch: return VdwType::PSwitch;
-            default:
-                GMX_THROW(gmx::InconsistentInputError(
-                        gmx::formatString("The requested VdW interaction modifier %s is not "
-                                          "implemented in the GPU accelerated kernels!",
-                                          enumValueToString(ic.vdw_modifier))));
-        }
-    }
-    else if (ic.vdwtype == VanDerWaalsType::Pme)
-    {
-        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
-        {
-            assert(ljCombinationRule == LJCombinationRule::Geometric);
-            return VdwType::EwaldGeom;
-        }
-        else
-        {
-            assert(ljCombinationRule == LJCombinationRule::LorentzBerthelot);
-            return VdwType::EwaldLB;
-        }
-    }
-    else
-    {
-        GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
-                "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
-                enumValueToString(ic.vdwtype))));
-    }
-}
-
-void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const gmx::InteractionLocality iLocality)
+void setupGpuShortRangeWork(NbnxmGpu*                      nb,
+                            const gmx::ListedForcesGpu*    listedForcesGpu,
+                            const gmx::InteractionLocality iLocality)
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");
 
@@ -748,7 +745,7 @@ void setupGpuShortRangeWork(NbnxmGpu* nb, const gmx::GpuBonded* gpuBonded, const
     // interaction locality contains entries or if there is any
     // bonded work (as this is not split into local/nonlocal).
     nb->haveWork[iLocality] = ((nb->plist[iLocality]->nsci != 0)
-                               || (gpuBonded != nullptr && gpuBonded->haveInteractions()));
+                               || (listedForcesGpu != nullptr && listedForcesGpu->haveInteractions()));
 }
 
 bool haveGpuShortRangeWork(const NbnxmGpu* nb, const gmx::InteractionLocality interactionLocality)
@@ -777,7 +774,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
                "beginning of the copy back function.");
 
     /* extract the data */
-    NBAtomData*         adat         = nb->atdat;
+    NBAtomDataGpu*      adat         = nb->atdat;
     Nbnxm::GpuTimers*   timers       = nb->timers;
     bool                bDoTime      = nb->bDoTime;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
@@ -815,17 +812,21 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
     }
 
     /* DtoH f */
-    static_assert(sizeof(*nbatom->out[0].f.data()) == sizeof(float),
-                  "The host force buffer should be in single precision to match device data size.");
-    copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
-                         &adat->f,
-                         atomsRange.begin(),
-                         atomsRange.size(),
-                         deviceStream,
-                         GpuApiCallBehavior::Async,
-                         bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
-
-    issueClFlushInStream(deviceStream);
+    if (!stepWork.useGpuFBufferOps)
+    {
+        static_assert(
+                sizeof(*nbatom->out[0].f.data()) == sizeof(float),
+                "The host force buffer should be in single precision to match device data size.");
+        copyFromDeviceBuffer(reinterpret_cast<Float3*>(nbatom->out[0].f.data()) + atomsRange.begin(),
+                             &adat->f,
+                             atomsRange.begin(),
+                             atomsRange.size(),
+                             deviceStream,
+                             GpuApiCallBehavior::Async,
+                             bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
+
+        issueClFlushInStream(deviceStream);
+    }
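
The D2H force copy is now skipped when GPU force-buffer operations are active: the nonbonded forces are then reduced and consumed on the device, and gpu_get_f() (added at the end of this diff) exposes the device buffer for that path. A hypothetical, simplified caller-side view — gmx::StepWorkload and the reduction step are assumptions here:

```cpp
static void consumeNonbondedForces(NbnxmGpu* nb, const gmx::StepWorkload& stepWork)
{
    if (stepWork.useGpuFBufferOps)
    {
        // Forces never leave the device this step; a device-side force
        // reduction reads the nonbonded force buffer directly.
        DeviceBuffer<gmx::RVec> d_f = Nbnxm::gpu_get_f(nb);
        // ... launch the GPU force reduction consuming d_f ...
    }
    // Otherwise gpu_launch_cpyback() has already enqueued the async D2H copy.
}
```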
 
     /* After the non-local D2H is launched the nonlocal_done event can be
        recorded which signals that the local D2H can proceed. This event is not
@@ -849,7 +850,7 @@ void gpu_launch_cpyback(NbnxmGpu*                nb,
             copyFromDeviceBuffer(nb->nbst.fShift,
                                  &adat->fShift,
                                  0,
-                                 SHIFTS,
+                                 gmx::c_numShiftVectors,
                                  deviceStream,
                                  GpuApiCallBehavior::Async,
                                  bDoTime ? timers->xf[atomLocality].nb_d2h.fetchNextEvent() : nullptr);
@@ -917,7 +918,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom
 
     const InteractionLocality iloc = atomToInteractionLocality(atomLocality);
 
-    NBAtomData*         adat         = nb->atdat;
+    NBAtomDataGpu*      adat         = nb->atdat;
     gpu_plist*          plist        = nb->plist[iloc];
     Nbnxm::GpuTimers*   timers       = nb->timers;
     const DeviceStream& deviceStream = *nb->deviceStreams[iloc];
@@ -1087,4 +1088,96 @@ void nbnxn_gpu_init_x_to_nbat_x(const Nbnxm::GridSet& gridSet, NbnxmGpu* gpu_nbv
     nbnxnInsertNonlocalGpuDependency(gpu_nbv, Nbnxm::InteractionLocality::NonLocal);
 }
 
+//! This function is documented in the header file
+void gpu_free(NbnxmGpu* nb)
+{
+    if (nb == nullptr)
+    {
+        return;
+    }
+
+    gpu_free_platform_specific(nb);
+
+    delete nb->timers;
+    sfree(nb->timings);
+
+    NBAtomDataGpu* atdat   = nb->atdat;
+    NBParamGpu*    nbparam = nb->nbparam;
+
+    /* Free atdat */
+    freeDeviceBuffer(&(nb->atdat->xq));
+    freeDeviceBuffer(&(nb->atdat->f));
+    freeDeviceBuffer(&(nb->atdat->eLJ));
+    freeDeviceBuffer(&(nb->atdat->eElec));
+    freeDeviceBuffer(&(nb->atdat->fShift));
+    freeDeviceBuffer(&(nb->atdat->shiftVec));
+    if (useLjCombRule(nb->nbparam->vdwType))
+    {
+        freeDeviceBuffer(&atdat->ljComb);
+    }
+    else
+    {
+        freeDeviceBuffer(&atdat->atomTypes);
+    }
+
+    /* Free nbparam */
+    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
+    {
+        destroyParamLookupTable(&nbparam->coulomb_tab, &nbparam->coulomb_tab_texobj);
+    }
+
+    if (!useLjCombRule(nb->nbparam->vdwType))
+    {
+        destroyParamLookupTable(&nbparam->nbfp, &nbparam->nbfp_texobj);
+    }
+
+    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
+    {
+        destroyParamLookupTable(&nbparam->nbfp_comb, &nbparam->nbfp_comb_texobj);
+    }
+
+    /* Free plist */
+    auto* plist = nb->plist[InteractionLocality::Local];
+    freeDeviceBuffer(&plist->sci);
+    freeDeviceBuffer(&plist->cj4);
+    freeDeviceBuffer(&plist->imask);
+    freeDeviceBuffer(&plist->excl);
+    delete plist;
+    if (nb->bUseTwoStreams)
+    {
+        auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
+        freeDeviceBuffer(&plist_nl->sci);
+        freeDeviceBuffer(&plist_nl->cj4);
+        freeDeviceBuffer(&plist_nl->imask);
+        freeDeviceBuffer(&plist_nl->excl);
+        delete plist_nl;
+    }
+
+    /* Free nbst */
+    pfree(nb->nbst.eLJ);
+    nb->nbst.eLJ = nullptr;
+
+    pfree(nb->nbst.eElec);
+    nb->nbst.eElec = nullptr;
+
+    pfree(nb->nbst.fShift);
+    nb->nbst.fShift = nullptr;
+
+    delete atdat;
+    delete nbparam;
+    delete nb;
+
+    if (debug)
+    {
+        fprintf(debug, "Cleaned up NBNXM GPU data structures.\n");
+    }
+}
+
+DeviceBuffer<gmx::RVec> gpu_get_f(NbnxmGpu* nb)
+{
+    GMX_ASSERT(nb != nullptr, "nb pointer must be valid");
+
+    return nb->atdat->f;
+}
+
 } // namespace Nbnxm