Unify init_gpu function in NBNXM
author Artem Zhmurov <zhmurov@gmail.com>
Fri, 19 Mar 2021 16:48:38 +0000 (16:48 +0000)
committer Paul Bauer <paul.bauer.q@gmail.com>
Fri, 19 Mar 2021 16:48:38 +0000 (16:48 +0000)
Refs. #2608

src/gromacs/ewald/pme_load_balancing.cpp
src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
src/gromacs/nbnxm/gpu_data_mgmt.h
src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.cpp
src/gromacs/nbnxm/nbnxm_gpu_data_mgmt.h
src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp
src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp

index b7c3879d914c62150c39ff6510e2f3f7f8460d52..19778b8999b897ac9ea29cbbcb454a5f2a5548c2 100644 (file)
@@ -856,7 +856,7 @@ static void pme_load_balance(pme_load_balancing_t*          pme_lb,
     /* We always re-initialize the tables whether they are used or not */
     init_interaction_const_tables(nullptr, ic, set->rlistOuter, ir.tabext);
 
-    Nbnxm::gpu_pme_loadbal_update_param(nbv, ic);
+    Nbnxm::gpu_pme_loadbal_update_param(nbv, *ic);
 
     if (!pme_lb->bSepPMERanks)
     {
index 804a8ea18066e61b104872e660be1ef056a95530..274f40448f256004dfe08cf223f67c70f9db78c3 100644 (file)
@@ -52,7 +52,6 @@
 // TODO Remove this comment when the above order issue is resolved
 #include "gromacs/gpu_utils/cudautils.cuh"
 #include "gromacs/gpu_utils/device_context.h"
-#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/gpu_utils.h"
 #include "gromacs/gpu_utils/gpueventsynchronizer.cuh"
 #include "gromacs/gpu_utils/pmalloc.h"
@@ -92,174 +91,11 @@ namespace Nbnxm
  */
 static unsigned int gpu_min_ci_balanced_factor = 44;
 
-/*! Initializes the atomdata structure first time, it only gets filled at
-    pair-search. */
-static void init_atomdata_first(NBAtomData*          ad,
-                                int                  nTypes,
-                                const DeviceContext& deviceContext,
-                                const DeviceStream&  localStream)
+void gpu_init_platform_specific(NbnxmGpu* /* nb */)
 {
-    ad->numTypes = nTypes;
-    allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
-    ad->shiftVecUploaded = false;
-
-    allocateDeviceBuffer(&ad->fShift, SHIFTS, deviceContext);
-    allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
-    allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
-
-    clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream);
-    clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream);
-    clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream);
-
-    /* initialize to nullptr poiters to data that is not allocated here and will
-       need reallocation in nbnxn_cuda_init_atomdata */
-    ad->xq = nullptr;
-    ad->f  = nullptr;
-
-    /* size -1 indicates that the respective array hasn't been initialized yet */
-    ad->numAtoms      = -1;
-    ad->numAtomsAlloc = -1;
-}
-
-/*! Initializes the nonbonded parameter data structure. */
-static void init_nbparam(NBParamGpu*                     nbp,
-                         const interaction_const_t*      ic,
-                         const PairlistParams&           listParams,
-                         const nbnxn_atomdata_t::Params& nbatParams,
-                         const DeviceContext&            deviceContext)
-{
-    const int ntypes = nbatParams.numTypes;
-
-    set_cutoff_parameters(nbp, ic, listParams);
-
-    /* The kernel code supports LJ combination rules (geometric and LB) for
-     * all kernel types, but we only generate useful combination rule kernels.
-     * We currently only use LJ combination rule (geometric and LB) kernels
-     * for plain cut-off LJ. On Maxwell the force only kernels speed up 15%
-     * with PME and 20% with RF, the other kernels speed up about half as much.
-     * For LJ force-switch the geometric rule would give 7% speed-up, but this
-     * combination is rarely used. LJ force-switch with LB rule is more common,
-     * but gives only 1% speed-up.
-     */
-    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
-    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
-
-    /* generate table for PME */
-    nbp->coulomb_tab = nullptr;
-    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
-    {
-        GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
-        init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, deviceContext);
-    }
-
-    /* set up LJ parameter lookup table */
-    if (!useLjCombRule(nbp->vdwType))
-    {
-        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
-                      "Mismatch in the size of host / device data types");
-        initParamLookupTable(&nbp->nbfp,
-                             &nbp->nbfp_texobj,
-                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
-                             ntypes * ntypes,
-                             deviceContext);
-    }
-
-    /* set up LJ-PME parameter lookup table */
-    if (ic->vdwtype == VanDerWaalsType::Pme)
-    {
-        static_assert(sizeof(decltype(nbp->nbfp_comb))
-                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
-                      "Mismatch in the size of host / device data types");
-        initParamLookupTable(&nbp->nbfp_comb,
-                             &nbp->nbfp_comb_texobj,
-                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
-                             ntypes,
-                             deviceContext);
-    }
-}
-
-NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
-                   const interaction_const_t*      ic,
-                   const PairlistParams&           listParams,
-                   const nbnxn_atomdata_t*         nbat,
-                   bool                            bLocalAndNonlocal)
-{
-    auto nb            = new NbnxmGpu();
-    nb->deviceContext_ = &deviceStreamManager.context();
-    snew(nb->atdat, 1);
-    snew(nb->nbparam, 1);
-    snew(nb->plist[InteractionLocality::Local], 1);
-    if (bLocalAndNonlocal)
-    {
-        snew(nb->plist[InteractionLocality::NonLocal], 1);
-    }
-
-    nb->bUseTwoStreams = bLocalAndNonlocal;
-
-    nb->timers = new Nbnxm::GpuTimers();
-    snew(nb->timings, 1);
-
-    /* init nbst */
-    pmalloc((void**)&nb->nbst.eLJ, sizeof(*nb->nbst.eLJ));
-    pmalloc((void**)&nb->nbst.eElec, sizeof(*nb->nbst.eElec));
-    pmalloc((void**)&nb->nbst.fShift, SHIFTS * sizeof(*nb->nbst.fShift));
-
-    init_plist(nb->plist[InteractionLocality::Local]);
-
-    /* local/non-local GPU streams */
-    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
-                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
-    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
-    nb->deviceStreams[InteractionLocality::Local] = &localStream;
-    if (nb->bUseTwoStreams)
-    {
-        init_plist(nb->plist[InteractionLocality::NonLocal]);
-
-        /* Note that the device we're running on does not have to support
-         * priorities, because we are querying the priority range which in this
-         * case will be a single value.
-         */
-        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
-                           "Non-local non-bonded stream should be initialized to use GPU for "
-                           "non-bonded with domain decomposition.");
-        nb->deviceStreams[InteractionLocality::NonLocal] =
-                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
-        ;
-    }
-
-    /* WARNING: CUDA timings are incorrect with multiple streams.
-     *          This is the main reason why they are disabled by default.
-     */
-    // TODO: Consider turning on by default when we can detect nr of streams.
-    nb->bDoTime = (getenv("GMX_ENABLE_GPU_TIMING") != nullptr);
-
-    if (nb->bDoTime)
-    {
-        init_timings(nb->timings);
-    }
-
     /* set the kernel type for the current GPU */
     /* pick L1 cache configuration */
     cuda_set_cacheconfig();
-
-    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
-    const DeviceContext&            deviceContext = *nb->deviceContext_;
-    init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
-    init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext);
-
-    nb->atomIndicesSize       = 0;
-    nb->atomIndicesSize_alloc = 0;
-    nb->ncxy_na               = 0;
-    nb->ncxy_na_alloc         = 0;
-    nb->ncxy_ind              = 0;
-    nb->ncxy_ind_alloc        = 0;
-
-    if (debug)
-    {
-        fprintf(debug, "Initialized CUDA data structures.\n");
-    }
-
-    return nb;
 }
 
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
@@ -290,17 +126,17 @@ void gpu_free(NbnxmGpu* nb)
         return;
     }
 
+    delete nb->timers;
+    sfree(nb->timings);
+
     NBAtomData* atdat   = nb->atdat;
     NBParamGpu* nbparam = nb->nbparam;
 
-    if ((!nbparam->coulomb_tab)
-        && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
+    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
     {
         destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
     }
 
-    delete nb->timers;
-
     if (!useLjCombRule(nb->nbparam->vdwType))
     {
         destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
@@ -319,8 +155,14 @@ void gpu_free(NbnxmGpu* nb)
 
     freeDeviceBuffer(&atdat->f);
     freeDeviceBuffer(&atdat->xq);
-    freeDeviceBuffer(&atdat->atomTypes);
-    freeDeviceBuffer(&atdat->ljComb);
+    if (useLjCombRule(nb->nbparam->vdwType))
+    {
+        freeDeviceBuffer(&atdat->ljComb);
+    }
+    else
+    {
+        freeDeviceBuffer(&atdat->atomTypes);
+    }
 
     /* Free plist */
     auto* plist = nb->plist[InteractionLocality::Local];
@@ -328,7 +170,7 @@ void gpu_free(NbnxmGpu* nb)
     freeDeviceBuffer(&plist->cj4);
     freeDeviceBuffer(&plist->imask);
     freeDeviceBuffer(&plist->excl);
-    sfree(plist);
+    delete plist;
     if (nb->bUseTwoStreams)
     {
         auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
@@ -336,7 +178,7 @@ void gpu_free(NbnxmGpu* nb)
         freeDeviceBuffer(&plist_nl->cj4);
         freeDeviceBuffer(&plist_nl->imask);
         freeDeviceBuffer(&plist_nl->excl);
-        sfree(plist_nl);
+        delete plist_nl;
     }
 
     /* Free nbst */
@@ -349,9 +191,8 @@ void gpu_free(NbnxmGpu* nb)
     pfree(nb->nbst.fShift);
     nb->nbst.fShift = nullptr;
 
-    sfree(atdat);
-    sfree(nbparam);
-    sfree(nb->timings);
+    delete atdat;
+    delete nbparam;
     delete nb;
 
     if (debug)
index 5b0c22085c33ca07bab3099db9e09cf5c7826bb0..d9f81f1d8deb0a27080b61d5518a844fc63a35a3 100644 (file)
@@ -95,7 +95,7 @@ void gpu_init_atomdata(NbnxmGpu gmx_unused* nb, const nbnxn_atomdata_t gmx_unuse
  */
 GPU_FUNC_QUALIFIER
 void gpu_pme_loadbal_update_param(const struct nonbonded_verlet_t gmx_unused* nbv,
-                                  const interaction_const_t gmx_unused* ic) GPU_FUNC_TERM;
+                                  const interaction_const_t gmx_unused& ic) GPU_FUNC_TERM;
 
 /** Uploads shift vector to the GPU if the box is dynamic (otherwise just returns). */
 GPU_FUNC_QUALIFIER
@@ -129,13 +129,13 @@ bool gpu_is_kernel_ewald_analytical(const NbnxmGpu gmx_unused* nb) GPU_FUNC_TERM
 
 /** Return the enum value of electrostatics kernel type for given interaction parameters \p ic. */
 GPU_FUNC_QUALIFIER
-enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t gmx_unused* ic,
+enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t gmx_unused& ic,
                                                    const DeviceInformation gmx_unused& deviceInfo)
         GPU_FUNC_TERM_WITH_RETURN(ElecType::Count);
 
 /** Return the enum value of VdW kernel type for given \p ic and \p combRule. */
 GPU_FUNC_QUALIFIER
-enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t gmx_unused* ic,
+enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t gmx_unused& ic,
                                        LJCombinationRule gmx_unused ljCombinationRule)
         GPU_FUNC_TERM_WITH_RETURN(VdwType::Count);
 
index b86b785b94238e764c042adad5dfbf4e459dc30a..c4efe8d4589157791ac89e9ab19098f4300472ad 100644 (file)
@@ -62,6 +62,8 @@
 
 #include "nbnxm_gpu_data_mgmt.h"
 
+#include "gromacs/gpu_utils/device_stream_manager.h"
+#include "gromacs/gpu_utils/pmalloc.h"
 #include "gromacs/hardware/device_information.h"
 #include "gromacs/mdtypes/interaction_const.h"
 #include "gromacs/mdtypes/simulation_workload.h"
@@ -169,29 +171,29 @@ enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t& ic,
     }
 }
 
-void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams)
+void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams)
 {
-    nbp->ewald_beta        = ic->ewaldcoeff_q;
-    nbp->sh_ewald          = ic->sh_ewald;
-    nbp->epsfac            = ic->epsfac;
-    nbp->two_k_rf          = 2.0 * ic->reactionFieldCoefficient;
-    nbp->c_rf              = ic->reactionFieldShift;
-    nbp->rvdw_sq           = ic->rvdw * ic->rvdw;
-    nbp->rcoulomb_sq       = ic->rcoulomb * ic->rcoulomb;
+    nbp->ewald_beta        = ic.ewaldcoeff_q;
+    nbp->sh_ewald          = ic.sh_ewald;
+    nbp->epsfac            = ic.epsfac;
+    nbp->two_k_rf          = 2.0 * ic.reactionFieldCoefficient;
+    nbp->c_rf              = ic.reactionFieldShift;
+    nbp->rvdw_sq           = ic.rvdw * ic.rvdw;
+    nbp->rcoulomb_sq       = ic.rcoulomb * ic.rcoulomb;
     nbp->rlistOuter_sq     = listParams.rlistOuter * listParams.rlistOuter;
     nbp->rlistInner_sq     = listParams.rlistInner * listParams.rlistInner;
     nbp->useDynamicPruning = listParams.useDynamicPruning;
 
-    nbp->sh_lj_ewald   = ic->sh_lj_ewald;
-    nbp->ewaldcoeff_lj = ic->ewaldcoeff_lj;
+    nbp->sh_lj_ewald   = ic.sh_lj_ewald;
+    nbp->ewaldcoeff_lj = ic.ewaldcoeff_lj;
 
-    nbp->rvdw_switch      = ic->rvdw_switch;
-    nbp->dispersion_shift = ic->dispersion_shift;
-    nbp->repulsion_shift  = ic->repulsion_shift;
-    nbp->vdw_switch       = ic->vdw_switch;
+    nbp->rvdw_switch      = ic.rvdw_switch;
+    nbp->dispersion_shift = ic.dispersion_shift;
+    nbp->repulsion_shift  = ic.repulsion_shift;
+    nbp->vdw_switch       = ic.vdw_switch;
 }
 
-void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t* ic)
+void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interaction_const_t& ic)
 {
     if (!nbv || !nbv->useGpu())
     {
@@ -202,10 +204,10 @@ void gpu_pme_loadbal_update_param(const nonbonded_verlet_t* nbv, const interacti
 
     set_cutoff_parameters(nbp, ic, nbv->pairlistSets().params());
 
-    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(*ic, nb->deviceContext_->deviceInfo());
+    nbp->elecType = nbnxn_gpu_pick_ewald_kernel_type(ic, nb->deviceContext_->deviceInfo());
 
-    GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
-    init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, *nb->deviceContext_);
+    GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+    init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, *nb->deviceContext_);
 }
 
 void init_plist(gpu_plist* pl)
@@ -253,6 +255,191 @@ void init_timings(gmx_wallclock_gpu_nbnxn_t* t)
     t->dynamicPruneTime.t = 0.0;
 }
 
+/*! \brief Initialize \p atomdata for the first time; it only gets filled at pair-search. */
+static void initAtomdataFirst(NBAtomData*          atomdata,
+                              int                  numTypes,
+                              const DeviceContext& deviceContext,
+                              const DeviceStream&  localStream)
+{
+    atomdata->numTypes = numTypes;
+    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
+    atomdata->shiftVecUploaded = false;
+
+    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
+    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
+    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
+
+    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
+    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
+    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
+
+    /* Set to nullptr the pointers to data that is not allocated here and will
+       need to be (re)allocated later */
+    atomdata->xq = nullptr;
+    atomdata->f  = nullptr;
+
+    /* size -1 indicates that the respective array hasn't been initialized yet */
+    atomdata->numAtoms      = -1;
+    atomdata->numAtomsAlloc = -1;
+}
+
+/*! \brief Fill the nonbonded parameter structure \p nbp from the interaction and list parameters. */
+static void initNbparam(NBParamGpu*                     nbp,
+                        const interaction_const_t&      ic,
+                        const PairlistParams&           listParams,
+                        const nbnxn_atomdata_t::Params& nbatParams,
+                        const DeviceContext&            deviceContext)
+{
+    const int numTypes = nbatParams.numTypes;
+
+    set_cutoff_parameters(nbp, ic, listParams);
+
+    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
+    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
+
+    if (ic.vdwtype == VanDerWaalsType::Pme)
+    {
+        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
+        {
+            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
+                       "Combination rule mismatch!");
+        }
+        else
+        {
+            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
+                       "Combination rule mismatch!");
+        }
+    }
+
+    /* generate the Ewald correction force table for the tabulated Ewald kernel types */
+    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
+    {
+        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
+        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
+    }
+    else
+    {
+        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
+        allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
+    }
+
+    /* set up LJ parameter lookup table, unless the kernel uses an LJ combination rule */
+    if (!useLjCombRule(nbp->vdwType))
+    {
+        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp,
+                             &nbp->nbfp_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
+                             numTypes * numTypes,
+                             deviceContext);
+    }
+    else
+    {
+        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
+        allocateDeviceBuffer(&nbp->nbfp, 1, deviceContext);
+    }
+
+    /* set up LJ-PME parameter lookup table (only when VdW is treated with PME) */
+    if (ic.vdwtype == VanDerWaalsType::Pme)
+    {
+        static_assert(sizeof(decltype(nbp->nbfp_comb))
+                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
+                      "Mismatch in the size of host / device data types");
+        initParamLookupTable(&nbp->nbfp_comb,
+                             &nbp->nbfp_comb_texobj,
+                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
+                             numTypes,
+                             deviceContext);
+    }
+    else
+    {
+        // Need to initialize for OpenCL, since it is unconditionally used as a kernel argument.
+        allocateDeviceBuffer(&nbp->nbfp_comb, 1, deviceContext);
+    }
+}
+
+NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
+                   const interaction_const_t*      ic,
+                   const PairlistParams&           listParams,
+                   const nbnxn_atomdata_t*         nbat,
+                   const bool                      bLocalAndNonlocal)
+{
+    auto* nb                              = new NbnxmGpu();
+    nb->deviceContext_                    = &deviceStreamManager.context();
+    nb->atdat                             = new NBAtomData;
+    nb->nbparam                           = new NBParamGpu;
+    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
+    if (bLocalAndNonlocal)
+    {
+        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
+    }
+
+    nb->bUseTwoStreams = bLocalAndNonlocal;
+
+    nb->timers = new Nbnxm::GpuTimers();
+    snew(nb->timings, 1);
+
+    /* WARNING: CUDA timings are incorrect with multiple streams.
+     * This is the main reason why they are disabled by default.
+     * Can be enabled by setting GMX_ENABLE_GPU_TIMING environment variable.
+     * TODO: Consider turning on by default when we can detect nr of streams.
+     *
+     * OpenCL timing is enabled by default and can be disabled by
+     * GMX_DISABLE_GPU_TIMING environment variable.
+     *
+     * Timing is disabled in SYCL; the assertion after the assignment guards this.
+     */
+    nb->bDoTime = (GMX_GPU_CUDA && (getenv("GMX_ENABLE_GPU_TIMING") != nullptr))
+                  || (GMX_GPU_OPENCL && (getenv("GMX_DISABLE_GPU_TIMING") == nullptr));
+    GMX_ASSERT(!(GMX_GPU_SYCL && nb->bDoTime), "GPU timing is not supported in SYCL");
+
+    if (nb->bDoTime)
+    {
+        init_timings(nb->timings);
+    }
+
+    /* init nbst */
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
+    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
+
+    init_plist(nb->plist[InteractionLocality::Local]);
+
+    /* local/non-local GPU streams */
+    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
+                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
+    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
+    nb->deviceStreams[InteractionLocality::Local] = &localStream;
+    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
+    // out-of-order. But for the time being, it will be less disruptive to keep them.
+    if (nb->bUseTwoStreams)
+    {
+        init_plist(nb->plist[InteractionLocality::NonLocal]);
+
+        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
+                           "Non-local non-bonded stream should be initialized to use GPU for "
+                           "non-bonded with domain decomposition.");
+        nb->deviceStreams[InteractionLocality::NonLocal] =
+                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
+    }
+
+    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
+    const DeviceContext&            deviceContext = *nb->deviceContext_;
+
+    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
+    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
+
+    gpu_init_platform_specific(nb);
+
+    if (debug)
+    {
+        fprintf(debug, "Initialized NBNXM GPU data structures.\n");
+    }
+
+    return nb;
+}
+
 //! This function is documented in the header file
 void gpu_init_pairlist(NbnxmGpu* nb, const NbnxnPairlistGpu* h_plist, const InteractionLocality iloc)
 {
@@ -364,8 +551,14 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)
         {
             freeDeviceBuffer(&atdat->f);
             freeDeviceBuffer(&atdat->xq);
-            freeDeviceBuffer(&atdat->ljComb);
-            freeDeviceBuffer(&atdat->atomTypes);
+            if (useLjCombRule(nb->nbparam->vdwType))
+            {
+                freeDeviceBuffer(&atdat->ljComb);
+            }
+            else
+            {
+                freeDeviceBuffer(&atdat->atomTypes);
+            }
         }
 
 
@@ -467,20 +660,20 @@ bool gpu_is_kernel_ewald_analytical(const NbnxmGpu* nb)
             || (nb->nbparam->elecType == ElecType::EwaldAnaTwin));
 }
 
-enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic,
+enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t& ic,
                                                    const DeviceInformation&   deviceInfo)
 {
-    if (ic->eeltype == CoulombInteractionType::Cut)
+    if (ic.eeltype == CoulombInteractionType::Cut)
     {
         return ElecType::Cut;
     }
-    else if (EEL_RF(ic->eeltype))
+    else if (EEL_RF(ic.eeltype))
     {
         return ElecType::RF;
     }
-    else if ((EEL_PME(ic->eeltype) || ic->eeltype == CoulombInteractionType::Ewald))
+    else if ((EEL_PME(ic.eeltype) || ic.eeltype == CoulombInteractionType::Ewald))
     {
-        return nbnxn_gpu_pick_ewald_kernel_type(*ic, deviceInfo);
+        return nbnxn_gpu_pick_ewald_kernel_type(ic, deviceInfo);
     }
     else
     {
@@ -488,16 +681,16 @@ enum ElecType nbnxmGpuPickElectrostaticsKernelType(const interaction_const_t* ic
         GMX_THROW(gmx::InconsistentInputError(
                 gmx::formatString("The requested electrostatics type %s is not implemented in "
                                   "the GPU accelerated kernels!",
-                                  enumValueToString(ic->eeltype))));
+                                  enumValueToString(ic.eeltype))));
     }
 }
 
 
-enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinationRule ljCombinationRule)
+enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t& ic, LJCombinationRule ljCombinationRule)
 {
-    if (ic->vdwtype == VanDerWaalsType::Cut)
+    if (ic.vdwtype == VanDerWaalsType::Cut)
     {
-        switch (ic->vdw_modifier)
+        switch (ic.vdw_modifier)
         {
             case InteractionModifiers::None:
             case InteractionModifiers::PotShift:
@@ -518,12 +711,12 @@ enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinat
                 GMX_THROW(gmx::InconsistentInputError(
                         gmx::formatString("The requested VdW interaction modifier %s is not "
                                           "implemented in the GPU accelerated kernels!",
-                                          enumValueToString(ic->vdw_modifier))));
+                                          enumValueToString(ic.vdw_modifier))));
         }
     }
-    else if (ic->vdwtype == VanDerWaalsType::Pme)
+    else if (ic.vdwtype == VanDerWaalsType::Pme)
     {
-        if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
+        if (ic.ljpme_comb_rule == LongRangeVdW::Geom)
         {
             assert(ljCombinationRule == LJCombinationRule::Geometric);
             return VdwType::EwaldGeom;
@@ -538,7 +731,7 @@ enum VdwType nbnxmGpuPickVdwKernelType(const interaction_const_t* ic, LJCombinat
     {
         GMX_THROW(gmx::InconsistentInputError(gmx::formatString(
                 "The requested VdW type %s is not implemented in the GPU accelerated kernels!",
-                enumValueToString(ic->vdwtype))));
+                enumValueToString(ic.vdwtype))));
     }
 }
 
index 36efd356b9ce45d7dd0e15822c50acb28c25c971..333b059a63001b3750e0ff20f74d94822deedb32 100644 (file)
@@ -44,6 +44,7 @@
 #ifndef GMX_NBNXM_NBNXM_GPU_DATA_MGMT_H
 #define GMX_NBNXM_NBNXM_GPU_DATA_MGMT_H
 
+class DeviceContext;
 struct interaction_const_t;
 struct NBParamGpu;
 struct PairlistParams;
@@ -73,7 +74,7 @@ enum ElecType nbnxn_gpu_pick_ewald_kernel_type(const interaction_const_t gmx_unu
 
 /*! \brief Copies all parameters related to the cut-off from ic to nbp
  */
-void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t* ic, const PairlistParams& listParams);
+void set_cutoff_parameters(NBParamGpu* nbp, const interaction_const_t& ic, const PairlistParams& listParams);
 
 /*! \brief Initializes the pair list data structure.
  */
@@ -82,6 +83,8 @@ void init_plist(gpu_plist* pl);
 /*! \brief Initializes the timings data structure. */
 void init_timings(gmx_wallclock_gpu_nbnxn_t* t);
 
+void gpu_init_platform_specific(NbnxmGpu* nb);
+
 } // namespace Nbnxm
 
 #endif // GMX_NBNXM_NBNXM_GPU_DATA_MGMT_H
index f666910d12425c9990b48a57233204c72d2d6fdb..08da7ba983c871e61f0f094482461e329faef38f 100644 (file)
@@ -52,7 +52,6 @@
 
 #include <cmath>
 
-#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/pmalloc.h"
 #include "gromacs/hardware/device_information.h"
 #include "gromacs/hardware/device_management.h"
@@ -99,104 +98,6 @@ namespace Nbnxm
  */
 static unsigned int gpu_min_ci_balanced_factor = 50;
 
-
-/*! \brief Initializes the atomdata structure first time, it only gets filled at
-    pair-search.
- */
-static void init_atomdata_first(NBAtomData*          ad,
-                                int                  ntypes,
-                                const DeviceContext& deviceContext,
-                                const DeviceStream&  localStream)
-{
-    ad->numTypes = ntypes;
-
-    allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
-    ad->shiftVecUploaded = false;
-
-    allocateDeviceBuffer(&ad->fShift, SHIFTS, deviceContext);
-    allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
-    allocateDeviceBuffer(&ad->eElec, 1, deviceContext);
-
-    clearDeviceBufferAsync(&ad->fShift, 0, SHIFTS, localStream);
-    clearDeviceBufferAsync(&ad->eElec, 0, 1, localStream);
-    clearDeviceBufferAsync(&ad->eLJ, 0, 1, localStream);
-
-    /* initialize to nullptr pointers to data that is not allocated here and will
-       need reallocation in nbnxn_gpu_init_atomdata */
-    ad->xq = nullptr;
-    ad->f  = nullptr;
-
-    /* size -1 indicates that the respective array hasn't been initialized yet */
-    ad->numAtoms      = -1;
-    ad->numAtomsAlloc = -1;
-}
-
-
-/*! \brief Initializes the nonbonded parameter data structure.
- */
-static void init_nbparam(NBParamGpu*                     nbp,
-                         const interaction_const_t*      ic,
-                         const PairlistParams&           listParams,
-                         const nbnxn_atomdata_t::Params& nbatParams,
-                         const DeviceContext&            deviceContext)
-{
-    set_cutoff_parameters(nbp, ic, listParams);
-
-    nbp->vdwType  = nbnxmGpuPickVdwKernelType(ic, nbatParams.ljCombinationRule);
-    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(ic, deviceContext.deviceInfo());
-
-    if (ic->vdwtype == VanDerWaalsType::Pme)
-    {
-        if (ic->ljpme_comb_rule == LongRangeVdW::Geom)
-        {
-            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::Geometric,
-                       "Combination rule mismatch!");
-        }
-        else
-        {
-            GMX_ASSERT(nbatParams.ljCombinationRule == LJCombinationRule::LorentzBerthelot,
-                       "Combination rule mismatch!");
-        }
-    }
-    /* generate table for PME */
-    nbp->coulomb_tab = nullptr;
-    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
-    {
-        GMX_RELEASE_ASSERT(ic->coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
-        init_ewald_coulomb_force_table(*ic->coulombEwaldTables, nbp, deviceContext);
-    }
-    else
-    {
-        allocateDeviceBuffer(&nbp->coulomb_tab, 1, deviceContext);
-    }
-
-    {
-        /* set up LJ parameter lookup table */
-        static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
-                      "Mismatch in the size of host / device data types");
-        DeviceBuffer<Float2> nbfp;
-        initParamLookupTable(&nbfp,
-                             nullptr,
-                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
-                             nbatParams.numTypes * nbatParams.numTypes,
-                             deviceContext);
-        nbp->nbfp = nbfp;
-
-        if (ic->vdwtype == VanDerWaalsType::Pme)
-        {
-            static_assert(sizeof(Float2) == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
-                          "Mismatch in the size of host / device data types");
-            DeviceBuffer<Float2> nbfp_comb;
-            initParamLookupTable(&nbfp_comb,
-                                 nullptr,
-                                 reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
-                                 nbatParams.numTypes,
-                                 deviceContext);
-            nbp->nbfp_comb = nbfp_comb;
-        }
-    }
-}
-
 /*! \brief Initializes the OpenCL kernel pointers of the nbnxn_ocl_ptr_t input data structure. */
 static cl_kernel nbnxn_gpu_create_kernel(NbnxmGpu* nb, const char* kernel_name)
 {
@@ -238,70 +139,11 @@ static void nbnxn_gpu_init_kernels(NbnxmGpu* nb)
             nbnxn_gpu_create_kernel(nb, "nbnxn_kernel_prune_rolling_opencl");
 }
 
-//! This function is documented in the header file
-NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
-                   const interaction_const_t*      ic,
-                   const PairlistParams&           listParams,
-                   const nbnxn_atomdata_t*         nbat,
-                   const bool                      bLocalAndNonlocal)
+void gpu_init_platform_specific(NbnxmGpu* nb)
 {
-    GMX_ASSERT(ic, "Need a valid interaction constants object");
-
-    auto nb            = new NbnxmGpu();
-    nb->deviceContext_ = &deviceStreamManager.context();
-    snew(nb->atdat, 1);
-    snew(nb->nbparam, 1);
-    snew(nb->plist[InteractionLocality::Local], 1);
-    if (bLocalAndNonlocal)
-    {
-        snew(nb->plist[InteractionLocality::NonLocal], 1);
-    }
-
-    nb->bUseTwoStreams = bLocalAndNonlocal;
-
-    nb->timers = new Nbnxm::GpuTimers();
-    snew(nb->timings, 1);
-
     /* set device info, just point it to the right GPU among the detected ones */
     nb->dev_rundata = new gmx_device_runtime_data_t();
 
-    /* init nbst */
-    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
-    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
-    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
-
-    init_plist(nb->plist[InteractionLocality::Local]);
-
-    /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. */
-    nb->bDoTime = (getenv("GMX_DISABLE_GPU_TIMING") == nullptr);
-
-    /* local/non-local GPU streams */
-    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
-                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
-    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
-    nb->deviceStreams[InteractionLocality::Local] = &localStream;
-
-    if (nb->bUseTwoStreams)
-    {
-        init_plist(nb->plist[InteractionLocality::NonLocal]);
-
-        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
-                           "Non-local non-bonded stream should be initialized to use GPU for "
-                           "non-bonded with domain decomposition.");
-        nb->deviceStreams[InteractionLocality::NonLocal] =
-                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
-    }
-
-    if (nb->bDoTime)
-    {
-        init_timings(nb->timings);
-    }
-
-    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
-    const DeviceContext&            deviceContext = *nb->deviceContext_;
-    init_atomdata_first(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
-    init_nbparam(nb->nbparam, ic, listParams, nbatParams, deviceContext);
-
     /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var.
      * TODO: decide about NVIDIA
      */
@@ -316,13 +158,6 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
      */
     nbnxn_gpu_compile_kernels(nb);
     nbnxn_gpu_init_kernels(nb);
-
-    if (debug)
-    {
-        fprintf(debug, "Initialized OpenCL data structures.\n");
-    }
-
-    return nb;
 }
 
 //! This function is documented in the header file
@@ -401,6 +236,12 @@ void gpu_free(NbnxmGpu* nb)
         return;
     }
 
+    delete nb->timers;
+    sfree(nb->timings);
+
+    NBAtomData* atdat   = nb->atdat;
+    NBParamGpu* nbparam = nb->nbparam;
+
     /* Free kernels */
     // NOLINTNEXTLINE(bugprone-sizeof-expression)
     int kernel_count = sizeof(nb->kernel_ener_noprune_ptr) / sizeof(nb->kernel_ener_noprune_ptr[0][0]);
@@ -424,16 +265,31 @@ void gpu_free(NbnxmGpu* nb)
     freeDeviceBuffer(&(nb->atdat->eLJ));
     freeDeviceBuffer(&(nb->atdat->eElec));
     freeDeviceBuffer(&(nb->atdat->fShift));
-    freeDeviceBuffer(&(nb->atdat->ljComb));
-    freeDeviceBuffer(&(nb->atdat->atomTypes));
     freeDeviceBuffer(&(nb->atdat->shiftVec));
-    sfree(nb->atdat);
+    if (useLjCombRule(nb->nbparam->vdwType))
+    {
+        freeDeviceBuffer(&atdat->ljComb);
+    }
+    else
+    {
+        freeDeviceBuffer(&atdat->atomTypes);
+    }
 
     /* Free nbparam */
-    freeDeviceBuffer(&(nb->nbparam->nbfp));
-    freeDeviceBuffer(&(nb->nbparam->nbfp_comb));
-    freeDeviceBuffer(&(nb->nbparam->coulomb_tab));
-    sfree(nb->nbparam);
+    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
+    {
+        destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
+    }
+
+    if (!useLjCombRule(nb->nbparam->vdwType))
+    {
+        destroyParamLookupTable(&nbparam->nbfp, nbparam->nbfp_texobj);
+    }
+
+    if (nbparam->vdwType == VdwType::EwaldGeom || nbparam->vdwType == VdwType::EwaldLB)
+    {
+        destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
+    }
 
     /* Free plist */
     auto* plist = nb->plist[InteractionLocality::Local];
@@ -441,7 +297,7 @@ void gpu_free(NbnxmGpu* nb)
     freeDeviceBuffer(&plist->cj4);
     freeDeviceBuffer(&plist->imask);
     freeDeviceBuffer(&plist->excl);
-    sfree(plist);
+    delete plist;
     if (nb->bUseTwoStreams)
     {
         auto* plist_nl = nb->plist[InteractionLocality::NonLocal];
@@ -449,7 +305,7 @@ void gpu_free(NbnxmGpu* nb)
         freeDeviceBuffer(&plist_nl->cj4);
         freeDeviceBuffer(&plist_nl->imask);
         freeDeviceBuffer(&plist_nl->excl);
-        sfree(plist_nl);
+        delete plist_nl;
     }
 
     /* Free nbst */
@@ -465,9 +321,8 @@ void gpu_free(NbnxmGpu* nb)
     freeGpuProgram(nb->dev_rundata->program);
     delete nb->dev_rundata;
 
-    /* Free timers and timings */
-    delete nb->timers;
-    sfree(nb->timings);
+    delete atdat;
+    delete nbparam;
     delete nb;
 
     if (debug)
index 2f37a0c011c90f8a9b7e7575b62d06ed1b92bad1..3998f833b51f7ab18ef723b8e0a96ce78bedd9eb 100644 (file)
@@ -41,7 +41,6 @@
  */
 #include "gmxpre.h"
 
-#include "gromacs/gpu_utils/device_stream_manager.h"
 #include "gromacs/gpu_utils/pmalloc.h"
 #include "gromacs/hardware/device_information.h"
 #include "gromacs/mdtypes/interaction_const.h"
 namespace Nbnxm
 {
 
-/*! \brief Initialize \p atomdata first time; it only gets filled at pair-search. */
-static void initAtomdataFirst(NBAtomData*          atomdata,
-                              int                  numTypes,
-                              const DeviceContext& deviceContext,
-                              const DeviceStream&  localStream)
+void gpu_init_platform_specific(NbnxmGpu* /* nb */)
 {
-    atomdata->numTypes = numTypes;
-    allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext);
-    atomdata->shiftVecUploaded = false;
-
-    allocateDeviceBuffer(&atomdata->fShift, SHIFTS, deviceContext);
-    allocateDeviceBuffer(&atomdata->eLJ, 1, deviceContext);
-    allocateDeviceBuffer(&atomdata->eElec, 1, deviceContext);
-
-    clearDeviceBufferAsync(&atomdata->fShift, 0, SHIFTS, localStream);
-    clearDeviceBufferAsync(&atomdata->eElec, 0, 1, localStream);
-    clearDeviceBufferAsync(&atomdata->eLJ, 0, 1, localStream);
-
-    /* initialize to nullptr pointers to data that is not allocated here and will
-       need reallocation in later */
-    atomdata->xq = nullptr;
-    atomdata->f  = nullptr;
-
-    /* size -1 indicates that the respective array hasn't been initialized yet */
-    atomdata->numAtoms      = -1;
-    atomdata->numAtomsAlloc = -1;
-}
-
-/*! \brief Initialize the nonbonded parameter data structure. */
-static void initNbparam(NBParamGpu*                     nbp,
-                        const interaction_const_t&      ic,
-                        const PairlistParams&           listParams,
-                        const nbnxn_atomdata_t::Params& nbatParams,
-                        const DeviceContext&            deviceContext)
-{
-    const int numTypes = nbatParams.numTypes;
-
-    set_cutoff_parameters(nbp, &ic, listParams);
-
-    nbp->vdwType  = nbnxmGpuPickVdwKernelType(&ic, nbatParams.ljCombinationRule);
-    nbp->elecType = nbnxmGpuPickElectrostaticsKernelType(&ic, deviceContext.deviceInfo());
-
-    /* generate table for PME */
-    nbp->coulomb_tab = nullptr;
-    if (nbp->elecType == ElecType::EwaldTab || nbp->elecType == ElecType::EwaldTabTwin)
-    {
-        GMX_RELEASE_ASSERT(ic.coulombEwaldTables, "Need valid Coulomb Ewald correction tables");
-        init_ewald_coulomb_force_table(*ic.coulombEwaldTables, nbp, deviceContext);
-    }
-
-    /* set up LJ parameter lookup table */
-    if (!useLjCombRule(nbp->vdwType))
-    {
-        static_assert(sizeof(decltype(nbp->nbfp)) == 2 * sizeof(decltype(*nbatParams.nbfp.data())),
-                      "Mismatch in the size of host / device data types");
-        initParamLookupTable(&nbp->nbfp,
-                             &nbp->nbfp_texobj,
-                             reinterpret_cast<const Float2*>(nbatParams.nbfp.data()),
-                             numTypes * numTypes,
-                             deviceContext);
-    }
-
-    /* set up LJ-PME parameter lookup table */
-    if (ic.vdwtype == VanDerWaalsType::Pme)
-    {
-        static_assert(sizeof(decltype(nbp->nbfp_comb))
-                              == 2 * sizeof(decltype(*nbatParams.nbfp_comb.data())),
-                      "Mismatch in the size of host / device data types");
-        initParamLookupTable(&nbp->nbfp_comb,
-                             &nbp->nbfp_comb_texobj,
-                             reinterpret_cast<const Float2*>(nbatParams.nbfp_comb.data()),
-                             numTypes,
-                             deviceContext);
-    }
-}
-
-NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager,
-                   const interaction_const_t*      ic,
-                   const PairlistParams&           listParams,
-                   const nbnxn_atomdata_t*         nbat,
-                   const bool                      bLocalAndNonlocal)
-{
-    auto* nb                              = new NbnxmGpu();
-    nb->deviceContext_                    = &deviceStreamManager.context();
-    nb->atdat                             = new NBAtomData;
-    nb->nbparam                           = new NBParamGpu;
-    nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist;
-    if (bLocalAndNonlocal)
-    {
-        nb->plist[InteractionLocality::NonLocal] = new Nbnxm::gpu_plist;
-    }
-
-    nb->bUseTwoStreams = bLocalAndNonlocal;
-
-    nb->timers  = nullptr;
-    nb->timings = nullptr;
-
-    /* init nbst */
-    pmalloc(reinterpret_cast<void**>(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ));
-    pmalloc(reinterpret_cast<void**>(&nb->nbst.eElec), sizeof(*nb->nbst.eElec));
-    pmalloc(reinterpret_cast<void**>(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift));
-
-    init_plist(nb->plist[InteractionLocality::Local]);
-
-    /* local/non-local GPU streams */
-    GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedLocal),
-                       "Local non-bonded stream should be initialized to use GPU for non-bonded.");
-    const DeviceStream& localStream = deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedLocal);
-    nb->deviceStreams[InteractionLocality::Local] = &localStream;
-    // In general, it's not strictly necessary to use 2 streams for SYCL, since they are
-    // out-of-order. But for the time being, it will be less disruptive to keep them.
-    if (nb->bUseTwoStreams)
-    {
-        init_plist(nb->plist[InteractionLocality::NonLocal]);
-
-        GMX_RELEASE_ASSERT(deviceStreamManager.streamIsValid(gmx::DeviceStreamType::NonBondedNonLocal),
-                           "Non-local non-bonded stream should be initialized to use GPU for "
-                           "non-bonded with domain decomposition.");
-        nb->deviceStreams[InteractionLocality::NonLocal] =
-                &deviceStreamManager.stream(gmx::DeviceStreamType::NonBondedNonLocal);
-    }
-
-    nb->bDoTime = false;
-
-    const nbnxn_atomdata_t::Params& nbatParams    = nbat->params();
-    const DeviceContext&            deviceContext = *nb->deviceContext_;
-
-    initNbparam(nb->nbparam, *ic, listParams, nbatParams, deviceContext);
-    initAtomdataFirst(nb->atdat, nbatParams.numTypes, deviceContext, localStream);
-
-    return nb;
+    // Nothing specific in SYCL
 }
 
 void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom)
@@ -218,11 +89,13 @@ void gpu_free(NbnxmGpu* nb)
         return;
     }
 
+    delete nb->timers;
+    sfree(nb->timings);
+
     NBAtomData* atdat   = nb->atdat;
     NBParamGpu* nbparam = nb->nbparam;
 
-    if ((!nbparam->coulomb_tab)
-        && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
+    if (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)
     {
         destroyParamLookupTable(&nbparam->coulomb_tab, nbparam->coulomb_tab_texobj);
     }