From cf8c4835899db2eab12091c5054a9c5b6c7081e4 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Mon, 22 Feb 2021 14:15:16 +0000 Subject: [PATCH] Unify NB atoms and staging data structures in OpenCL, CUDA and SYCL Refs #2608 --- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 30 ++--- .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 109 +++++++++--------- src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh | 18 +-- .../nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu | 6 +- .../cuda/nbnxm_cuda_kernel_pruneonly.cuh | 8 +- src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 65 +---------- src/gromacs/nbnxm/gpu_common.h | 15 ++- src/gromacs/nbnxm/gpu_types_common.h | 54 +++++++++ src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 56 ++++----- .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 96 +++++++-------- src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 60 +--------- src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp | 16 +-- .../nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp | 52 ++++----- src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp | 2 +- .../sycl/nbnxm_sycl_kernel_pruneonly.cpp | 2 +- src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h | 58 +--------- 16 files changed, 263 insertions(+), 384 deletions(-) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 62d50f039d..594c4ca291 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -122,7 +122,7 @@ namespace Nbnxm constexpr static int c_bufOpsThreadsPerBlock = 128; /*! Nonbonded kernel function pointer type */ -typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const NBParamGpu, const gpu_plist, bool); +typedef void (*nbnxn_cu_kfunc_ptr_t)(const NBAtomData, const NBParamGpu, const gpu_plist, bool); /*********************************/ @@ -456,7 +456,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -488,12 +488,12 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom if (atomLocality == AtomLocality::Local) { adat_begin = 0; - adat_len = adat->natoms_local; + adat_len = adat->numAtomsLocal; } else { - adat_begin = adat->natoms_local; - adat_len = adat->natoms - adat->natoms_local; + adat_begin = adat->numAtomsLocal; + adat_len = adat->numAtoms - adat->numAtomsLocal; } /* beginning of timed HtoD section */ @@ -546,7 +546,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom */ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; @@ -667,7 +667,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; @@ -811,7 +811,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, "beginning of the copy back function."); /* extract the data */ - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; cu_timers_t* t = nb->timers; bool bDoTime = nb->bDoTime; const DeviceStream& deviceStream = 
*nb->deviceStreams[iloc];
@@ -872,24 +872,24 @@ void gpu_launch_cpyback(NbnxmGpu* nb,
     /* DtoH fshift when virial is needed */
     if (stepWork.computeVirial)
     {
-        static_assert(sizeof(nb->nbst.fshift[0]) == sizeof(adat->fshift[0]),
+        static_assert(sizeof(nb->nbst.fShift[0]) == sizeof(adat->fShift[0]),
                       "Sizes of host- and device-side shift vectors should be the same.");
         copyFromDeviceBuffer(
-                nb->nbst.fshift, &adat->fshift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
+                nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
     }

     /* DtoH energies */
     if (stepWork.computeEnergy)
     {
-        static_assert(sizeof(nb->nbst.e_lj[0]) == sizeof(adat->e_lj[0]),
+        static_assert(sizeof(nb->nbst.eLJ[0]) == sizeof(adat->eLJ[0]),
                       "Sizes of host- and device-side LJ energy terms should be the same.");
         copyFromDeviceBuffer(
-                nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        static_assert(sizeof(nb->nbst.e_el[0]) == sizeof(adat->e_el[0]),
+                nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+        static_assert(sizeof(nb->nbst.eElec[0]) == sizeof(adat->eElec[0]),
                       "Sizes of host- and device-side electrostatic energy terms should be the "
                       "same.");
         copyFromDeviceBuffer(
-                nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+                nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
     }
 }

@@ -929,7 +929,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid,
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

-    cu_atomdata_t* adat = nb->atdat;
+    NBAtomData* adat = nb->atdat;

     const int numColumns = grid.numColumns();
     const int cellOffset = grid.cellOffset();
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index db8654440e..2505422927 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -97,15 +97,15 @@ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb);

 /*! Initializes the atomdata structure first time, it only gets filled at pair-search. */
-static void init_atomdata_first(cu_atomdata_t* ad, int ntypes, const DeviceContext& deviceContext)
+static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext)
 {
-    ad->ntypes = ntypes;
-    allocateDeviceBuffer(&ad->shift_vec, SHIFTS, deviceContext);
-    ad->bShiftVecUploaded = false;
+    ad->numTypes = ntypes;
+    allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
+    ad->shiftVecUploaded = false;

-    allocateDeviceBuffer(&ad->fshift, SHIFTS, deviceContext);
-    allocateDeviceBuffer(&ad->e_lj, 1, deviceContext);
-    allocateDeviceBuffer(&ad->e_el, 1, deviceContext);
+    allocateDeviceBuffer(&ad->fShift, SHIFTS, deviceContext);
+    allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
+    allocateDeviceBuffer(&ad->eElec, 1, deviceContext);

     /* initialize to nullptr pointers to data that is not allocated here and will need
        reallocation in nbnxn_cuda_init_atomdata */
     ad->xq = nullptr;
     ad->f  = nullptr;

     /* size -1 indicates that the respective array hasn't been initialized yet */
-    ad->natoms = -1;
-    ad->nalloc = -1;
+    ad->numAtoms      = -1;
+    ad->numAtomsAlloc = -1;
 }

 /*! Initializes the nonbonded parameter data structure.
*/ @@ -198,9 +198,9 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, snew(nb->timings, 1); /* init nbst */ - pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj)); - pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el)); - pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift)); + pmalloc((void**)&nb->nbst.eLJ, sizeof(*nb->nbst.eLJ)); + pmalloc((void**)&nb->nbst.eElec, sizeof(*nb->nbst.eElec)); + pmalloc((void**)&nb->nbst.fShift, SHIFTS * sizeof(*nb->nbst.fShift)); init_plist(nb->plist[InteractionLocality::Local]); @@ -259,29 +259,29 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; /* only if we have a dynamic box */ - if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) + if (nbatom->bDynamicBox || !adat->shiftVecUploaded) { - static_assert(sizeof(adat->shift_vec[0]) == sizeof(nbatom->shift_vec[0]), + static_assert(sizeof(adat->shiftVec[0]) == sizeof(nbatom->shift_vec[0]), "Sizes of host- and device-side shift vectors should be the same."); - copyToDeviceBuffer(&adat->shift_vec, + copyToDeviceBuffer(&adat->shiftVec, reinterpret_cast(nbatom->shift_vec.data()), 0, SHIFTS, localStream, GpuApiCallBehavior::Async, nullptr); - adat->bShiftVecUploaded = true; + adat->shiftVecUploaded = true; } } /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; clearDeviceBufferAsync(&adat->f, 0, natoms_clear, localStream); } @@ -289,17 +289,17 @@ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear) /*! Clears nonbonded shift force output array and energy outputs on the GPU. 
*/ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - clearDeviceBufferAsync(&adat->fshift, 0, SHIFTS, localStream); - clearDeviceBufferAsync(&adat->e_lj, 0, 1, localStream); - clearDeviceBufferAsync(&adat->e_el, 0, 1, localStream); + clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream); + clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream); + clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream); } void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) { - nbnxn_cuda_clear_f(nb, nb->atdat->natoms); + nbnxn_cuda_clear_f(nb, nb->atdat->numAtoms); /* clear shift force array and energies if the outputs were used in the current step */ if (computeVirial) @@ -314,7 +314,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) bool realloced; bool bDoTime = nb->bDoTime; cu_timers_t* timers = nb->timers; - cu_atomdata_t* d_atdat = nb->atdat; + NBAtomData* d_atdat = nb->atdat; const DeviceContext& deviceContext = *nb->deviceContext_; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; @@ -329,36 +329,36 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) /* need to reallocate if we have to copy more atoms than the amount of space available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */ - if (natoms > d_atdat->nalloc) + if (natoms > d_atdat->numAtomsAlloc) { nalloc = over_alloc_small(natoms); /* free up first if the arrays have already been initialized */ - if (d_atdat->nalloc != -1) + if (d_atdat->numAtomsAlloc != -1) { freeDeviceBuffer(&d_atdat->f); freeDeviceBuffer(&d_atdat->xq); - freeDeviceBuffer(&d_atdat->atom_types); - freeDeviceBuffer(&d_atdat->lj_comb); + freeDeviceBuffer(&d_atdat->atomTypes); + freeDeviceBuffer(&d_atdat->ljComb); } allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext); allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext); if (useLjCombRule(nb->nbparam->vdwType)) { - allocateDeviceBuffer(&d_atdat->lj_comb, nalloc, deviceContext); + allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext); } else { - allocateDeviceBuffer(&d_atdat->atom_types, nalloc, deviceContext); + allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext); } - d_atdat->nalloc = nalloc; - realloced = true; + d_atdat->numAtomsAlloc = nalloc; + realloced = true; } - d_atdat->natoms = natoms; - d_atdat->natoms_local = nbat->natoms_local; + d_atdat->numAtoms = natoms; + d_atdat->numAtomsLocal = nbat->natoms_local; /* need to clear GPU f output if realloc happened */ if (realloced) @@ -368,9 +368,9 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwType)) { - static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(Float2), + static_assert(sizeof(d_atdat->ljComb[0]) == sizeof(Float2), "Size of the LJ parameters element should be equal to the size of float2."); - copyToDeviceBuffer(&d_atdat->lj_comb, + copyToDeviceBuffer(&d_atdat->ljComb, reinterpret_cast(nbat->params().lj_comb.data()), 0, natoms, @@ -380,9 +380,9 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) } else { - static_assert(sizeof(d_atdat->atom_types[0]) == sizeof(nbat->params().type[0]), + static_assert(sizeof(d_atdat->atomTypes[0]) == sizeof(nbat->params().type[0]), "Sizes of host- and device-side atom types should be the same."); - copyToDeviceBuffer(&d_atdat->atom_types, + 
copyToDeviceBuffer(&d_atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           natoms,
@@ -399,16 +399,13 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)

 void gpu_free(NbnxmGpu* nb)
 {
-    cu_atomdata_t* atdat;
-    NBParamGpu*    nbparam;
-
     if (nb == nullptr)
     {
         return;
     }

-    atdat   = nb->atdat;
-    nbparam = nb->nbparam;
+    NBAtomData* atdat   = nb->atdat;
+    NBParamGpu* nbparam = nb->nbparam;

     if ((!nbparam->coulomb_tab)
         && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
@@ -428,16 +425,16 @@ void gpu_free(NbnxmGpu* nb)
         destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
     }

-    freeDeviceBuffer(&atdat->shift_vec);
-    freeDeviceBuffer(&atdat->fshift);
+    freeDeviceBuffer(&atdat->shiftVec);
+    freeDeviceBuffer(&atdat->fShift);

-    freeDeviceBuffer(&atdat->e_lj);
-    freeDeviceBuffer(&atdat->e_el);
+    freeDeviceBuffer(&atdat->eLJ);
+    freeDeviceBuffer(&atdat->eElec);

     freeDeviceBuffer(&atdat->f);
     freeDeviceBuffer(&atdat->xq);
-    freeDeviceBuffer(&atdat->atom_types);
-    freeDeviceBuffer(&atdat->lj_comb);
+    freeDeviceBuffer(&atdat->atomTypes);
+    freeDeviceBuffer(&atdat->ljComb);

     /* Free plist */
     auto* plist = nb->plist[InteractionLocality::Local];
@@ -457,14 +454,14 @@ void gpu_free(NbnxmGpu* nb)
     }

     /* Free nbst */
-    pfree(nb->nbst.e_lj);
-    nb->nbst.e_lj = nullptr;
+    pfree(nb->nbst.eLJ);
+    nb->nbst.eLJ = nullptr;

-    pfree(nb->nbst.e_el);
-    nb->nbst.e_el = nullptr;
+    pfree(nb->nbst.eElec);
+    nb->nbst.eElec = nullptr;

-    pfree(nb->nbst.fshift);
-    nb->nbst.fshift = nullptr;
+    pfree(nb->nbst.fShift);
+    nb->nbst.fShift = nullptr;

     sfree(atdat);
     sfree(nbparam);
@@ -501,7 +498,7 @@ DeviceBuffer<gmx::RVec> gpu_get_fshift(NbnxmGpu* nb)
 {
     assert(nb);

-    return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->fshift);
+    return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->fShift);
 }

 /* Initialization for X buffer operations on GPU. */
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
index 688e094715..344e971c84 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
@@ -159,7 +159,7 @@ __launch_bounds__(THREADS_PER_BLOCK) __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 # endif /* CALC_ENERGIES */
 #endif     /* PRUNE_NBL */
-        (const cu_atomdata_t atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
+        (const NBAtomData atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
 #ifdef FUNCTION_DECLARATION_ONLY
         ; /* Only do function declaration, omit the function body.
*/
 #else
@@ -172,15 +172,15 @@ __launch_bounds__(THREADS_PER_BLOCK)
     nbnxn_cj4_t*        pl_cj4 = plist.cj4;
     const nbnxn_excl_t* excl   = plist.excl;
# ifndef LJ_COMB
-    const int* atom_types = atdat.atom_types;
-    int        ntypes     = atdat.ntypes;
+    const int* atom_types = atdat.atomTypes;
+    int        ntypes     = atdat.numTypes;
# else
-    const float2* lj_comb = atdat.lj_comb;
+    const float2* lj_comb = atdat.ljComb;
     float2        ljcp_i, ljcp_j;
# endif
     const float4* xq = atdat.xq;
     float3*       f  = asFloat3(atdat.f);
-    const float3* shift_vec = asFloat3(atdat.shift_vec);
+    const float3* shift_vec = asFloat3(atdat.shiftVec);
     float         rcoulomb_sq = nbparam.rcoulomb_sq;
# ifdef VDW_CUTOFF_CHECK
     float         rvdw_sq = nbparam.rvdw_sq;
@@ -207,8 +207,8 @@ __launch_bounds__(THREADS_PER_BLOCK)
# else
     float reactionFieldShift = nbparam.c_rf;
# endif /* EL_EWALD_ANY */
-    float* e_lj = atdat.e_lj;
-    float* e_el = atdat.e_el;
+    float* e_lj = atdat.eLJ;
+    float* e_el = atdat.eElec;
# endif /* CALC_ENERGIES */

     /* thread/block/warp id-s */
@@ -649,8 +649,8 @@ __launch_bounds__(THREADS_PER_BLOCK)
     /* add up local shift forces into global mem, tidxj indexes x,y,z */
     if (bCalcFshift && (tidxj & 3) < 3)
     {
-        float3* fshift = asFloat3(atdat.fshift);
-        atomicAdd(&(fshift[nb_sci.shift].x) + (tidxj & 3), fshift_buf);
+        float3* fShift = asFloat3(atdat.fShift);
+        atomicAdd(&(fShift[nb_sci.shift].x) + (tidxj & 3), fshift_buf);
     }

# ifdef CALC_ENERGIES
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu
index fb8ebb2e76..2ff980a6b0 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -39,7 +39,7 @@
 #ifndef FUNCTION_DECLARATION_ONLY
 /* Instantiate external template functions */
 template __global__ void
-nbnxn_kernel_prune_cuda<true>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
+nbnxn_kernel_prune_cuda<true>(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
 template __global__ void
-nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
+nbnxn_kernel_prune_cuda<false>(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
 #endif
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
index 11c51227f4..c5c55e667f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
@@ -104,7 +104,7 @@
  */
 template<bool haveFreshList>
 __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
-        void nbnxn_kernel_prune_cuda(const cu_atomdata_t    atdat,
+        void nbnxn_kernel_prune_cuda(const NBAtomData       atdat,
                                      const NBParamGpu       nbparam,
                                      const Nbnxm::gpu_plist plist,
                                      int                    numParts,
@@ -115,9 +115,9 @@ __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
 // Add extern declarations so each translation unit understands that
 // there will be a definition provided.
extern template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int); +nbnxn_kernel_prune_cuda(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int); extern template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int); +nbnxn_kernel_prune_cuda(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int); #else { @@ -125,7 +125,7 @@ nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnx const nbnxn_sci_t* pl_sci = plist.sci; nbnxn_cj4_t* pl_cj4 = plist.cj4; const float4* xq = atdat.xq; - const float3* shift_vec = asFloat3(atdat.shift_vec); + const float3* shift_vec = asFloat3(atdat.shiftVec); float rlistOuter_sq = nbparam.rlistOuter_sq; float rlistInner_sq = nbparam.rlistInner_sq; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index f1b1a6db81..08d96de90f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -64,67 +64,6 @@ /*! \brief cluster size = number of atoms per cluster. */ static constexpr int c_clSize = c_nbnxnGpuClusterSize; -/* All structs prefixed with "cu_" hold data used in GPU calculations and - * are passed to the kernels, except cu_timers_t. */ -/*! \cond */ -typedef struct cu_atomdata cu_atomdata_t; -/*! \endcond */ - - -/** \internal - * \brief Staging area for temporary data downloaded from the GPU. - * - * The energies/shift forces get downloaded here first, before getting added - * to the CPU-side aggregate values. - */ -struct nb_staging_t -{ - //! LJ energy - float* e_lj = nullptr; - //! electrostatic energy - float* e_el = nullptr; - //! shift forces - Float3* fshift = nullptr; -}; - -/** \internal - * \brief Nonbonded atom data - both inputs and outputs. - */ -struct cu_atomdata -{ - //! number of atoms - int natoms; - //! number of local atoms - int natoms_local; - //! allocation size for the atom data (xq, f) - int nalloc; - - //! atom coordinates + charges, size natoms - DeviceBuffer xq; - //! force output array, size natoms - DeviceBuffer f; - - //! LJ energy output, size 1 - DeviceBuffer e_lj; - //! Electrostatics energy input, size 1 - DeviceBuffer e_el; - - //! shift forces - DeviceBuffer fshift; - - //! number of atom types - int ntypes; - //! atom type indices, size natoms - DeviceBuffer atom_types; - //! sqrt(c6),sqrt(c12) size natoms - DeviceBuffer lj_comb; - - //! shifts - DeviceBuffer shift_vec; - //! true if the shift vector has been uploaded - bool bShiftVecUploaded; -}; - /** \internal * \brief Typedef of actual timer type. */ @@ -146,7 +85,7 @@ struct NbnxmGpu bool bNonLocalStreamDoneMarked = false; /*! \brief atom data */ - cu_atomdata_t* atdat = nullptr; + NBAtomData* atdat = nullptr; /*! \brief array of atom indices */ int* atomIndices = nullptr; /*! \brief size of atom indices */ @@ -170,7 +109,7 @@ struct NbnxmGpu /*! \brief pair-list data structures (local and non-local) */ gmx::EnumerationArray plist = { { nullptr } }; /*! \brief staging area where fshift/energies get downloaded */ - nb_staging_t nbst; + NBStagingData nbst; /*! 
\brief local and non-local GPU streams */
    gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index bb1eb874ae..dccfe1eed0 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -183,12 +183,12 @@ static inline void getGpuAtomRange(const AtomDataT* atomData,
     if (atomLocality == AtomLocality::Local)
     {
         *atomRangeBegin = 0;
-        *atomRangeLen   = atomData->natoms_local;
+        *atomRangeLen   = atomData->numAtomsLocal;
     }
     else
     {
-        *atomRangeBegin = atomData->natoms_local;
-        *atomRangeLen   = atomData->natoms - atomData->natoms_local;
+        *atomRangeBegin = atomData->numAtomsLocal;
+        *atomRangeLen   = atomData->numAtoms - atomData->numAtomsLocal;
     }
 }
@@ -241,7 +241,6 @@ static void countPruneKernelTime(GpuTimers* timers,
  * Note that this function should always be called after the transfers into the
  * staging buffers have completed.
  *
- * \tparam     StagingData    Type of staging data
  * \param[in]  nbst           Nonbonded staging data
  * \param[in]  iLocality      Interaction locality specifier
  * \param[in]  reduceEnergies True if energy reduction should be done
  * \param[in]  reduceFshift   True if shift force reduction should be done
  * \param[out] e_lj           Variable to accumulate LJ energy into
  * \param[out] e_el           Variable to accumulate electrostatic energy into
  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
  */
-static inline void gpu_reduce_staged_outputs(const nb_staging_t&       nbst,
+static inline void gpu_reduce_staged_outputs(const NBStagingData&      nbst,
                                              const InteractionLocality iLocality,
                                              const bool                reduceEnergies,
                                              const bool                reduceFshift,
@@ -263,15 +262,15 @@
     {
         if (reduceEnergies)
         {
-            *e_lj += *nbst.e_lj;
-            *e_el += *nbst.e_el;
+            *e_lj += *nbst.eLJ;
+            *e_el += *nbst.eElec;
         }

         if (reduceFshift)
         {
             for (int i = 0; i < SHIFTS; i++)
             {
-                rvec_inc(fshift[i], nbst.fshift[i]);
+                rvec_inc(fshift[i], nbst.fShift[i]);
             }
         }
     }
diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h
index ebd5db9b34..85c5853ebd 100644
--- a/src/gromacs/nbnxm/gpu_types_common.h
+++ b/src/gromacs/nbnxm/gpu_types_common.h
@@ -73,6 +73,60 @@
 //! Default for the prune kernel's j4 processing concurrency.
 static constexpr int c_pruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY;

+/*! \internal
+ * \brief Staging area for temporary data downloaded from the GPU.
+ *
+ * Since SYCL buffers already have host-side storage, this is a bit redundant.
+ * But it allows prefetching of the data from GPU, and brings GPU backends closer together.
+ */
+struct NBStagingData
+{
+    //! LJ energy
+    float* eLJ = nullptr;
+    //! electrostatic energy
+    float* eElec = nullptr;
+    //! shift forces
+    Float3* fShift = nullptr;
+};
+
+/** \internal
+ * \brief Nonbonded atom data - both inputs and outputs.
+ */
+struct NBAtomData
+{
+    //! number of atoms
+    int numAtoms;
+    //! number of local atoms
+    int numAtomsLocal;
+    //! allocation size for the atom data (xq, f)
+    int numAtomsAlloc;
+
+    //! atom coordinates + charges, size \ref numAtoms
+    DeviceBuffer<Float4> xq;
+    //! force output array, size \ref numAtoms
+    DeviceBuffer<Float3> f;
+
+    //! LJ energy output, size 1
+    DeviceBuffer<float> eLJ;
+    //! Electrostatics energy output, size 1
+    DeviceBuffer<float> eElec;
+
+    //! shift forces
+    DeviceBuffer<Float3> fShift;
+
+    //! number of atom types
+    int numTypes;
+    //! atom type indices, size \ref numAtoms
+    DeviceBuffer<int> atomTypes;
+    //! sqrt(c6),sqrt(c12), size \ref numAtoms
+    DeviceBuffer<Float2> ljComb;
+
+    //! shift vectors
+    DeviceBuffer<Float3> shiftVec;
+    //! 
true if the shift vector has been uploaded + bool shiftVecUploaded; +}; + /** \internal * \brief Parameters required for the GPU nonbonded calculations. */ diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 6cdad01019..e00874a30b 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -530,7 +530,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* local/nonlocal offset and length used for xq and f */ int adat_begin, adat_len; - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -562,12 +562,12 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom if (atomLocality == AtomLocality::Local) { adat_begin = 0; - adat_len = adat->natoms_local; + adat_len = adat->numAtomsLocal; } else { - adat_begin = adat->natoms_local; - adat_len = adat->natoms - adat->natoms_local; + adat_begin = adat->numAtomsLocal; + adat_len = adat->numAtoms - adat->numAtomsLocal; } /* beginning of timed HtoD section */ @@ -622,7 +622,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom */ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc) { - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; @@ -717,11 +717,11 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb &nbparams_params, &adat->xq, &adat->f, - &adat->e_lj, - &adat->e_el, - &adat->fshift, - &adat->lj_comb, - &adat->shift_vec, + &adat->eLJ, + &adat->eElec, + &adat->fShift, + &adat->ljComb, + &adat->shiftVec, &nbp->nbfp, &nbp->nbfp_comb, &nbp->coulomb_tab, @@ -736,15 +736,15 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb { const auto kernelArgs = prepareGpuKernelArguments(kernel, config, - &adat->ntypes, + &adat->numTypes, &nbparams_params, &adat->xq, &adat->f, - &adat->e_lj, - &adat->e_el, - &adat->fshift, - &adat->atom_types, - &adat->shift_vec, + &adat->eLJ, + &adat->eElec, + &adat->fShift, + &adat->atomTypes, + &adat->shiftVec, &nbp->nbfp, &nbp->nbfp_comb, &nbp->coulomb_tab, @@ -793,7 +793,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) */ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; @@ -898,7 +898,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c config, &nbparams_params, &adat->xq, - &adat->shift_vec, + &adat->shiftVec, &plist->sci, &plist->cj4, &plist->imask, @@ -945,7 +945,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, "Non-local stream is indicating that the copy back event is enqueued at the " "beginning of the copy back function."); - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; cl_timers_t* t = nb->timers; bool bDoTime = nb->bDoTime; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -1013,10 +1013,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, if (stepWork.computeVirial) { static_assert( - sizeof(*nb->nbst.fshift) == sizeof(Float3), + sizeof(*nb->nbst.fShift) == sizeof(Float3), "Sizes of 
host- and device-side shift vector elements should be the same."); - copyFromDeviceBuffer(nb->nbst.fshift, - &adat->fshift, + copyFromDeviceBuffer(nb->nbst.fShift, + &adat->fShift, 0, SHIFTS, deviceStream, @@ -1027,20 +1027,20 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH energies */ if (stepWork.computeEnergy) { - static_assert(sizeof(*nb->nbst.e_lj) == sizeof(float), + static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float), "Sizes of host- and device-side LJ energy terms should be the same."); - copyFromDeviceBuffer(nb->nbst.e_lj, - &adat->e_lj, + copyFromDeviceBuffer(nb->nbst.eLJ, + &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); - static_assert(sizeof(*nb->nbst.e_el) == sizeof(float), + static_assert(sizeof(*nb->nbst.eElec) == sizeof(float), "Sizes of host- and device-side electrostatic energy terms should be the " "same."); - copyFromDeviceBuffer(nb->nbst.e_el, - &adat->e_el, + copyFromDeviceBuffer(nb->nbst.eElec, + &adat->eElec, 0, 1, deviceStream, diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index da998182ca..25cb3158b2 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -103,16 +103,16 @@ static unsigned int gpu_min_ci_balanced_factor = 50; /*! \brief Initializes the atomdata structure first time, it only gets filled at pair-search. */ -static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, const DeviceContext& deviceContext) +static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext) { - ad->ntypes = ntypes; + ad->numTypes = ntypes; - allocateDeviceBuffer(&ad->shift_vec, SHIFTS * DIM, deviceContext); - ad->bShiftVecUploaded = CL_FALSE; + allocateDeviceBuffer(&ad->shiftVec, SHIFTS * DIM, deviceContext); + ad->shiftVecUploaded = false; - allocateDeviceBuffer(&ad->fshift, SHIFTS * DIM, deviceContext); - allocateDeviceBuffer(&ad->e_lj, 1, deviceContext); - allocateDeviceBuffer(&ad->e_el, 1, deviceContext); + allocateDeviceBuffer(&ad->fShift, SHIFTS * DIM, deviceContext); + allocateDeviceBuffer(&ad->eLJ, 1, deviceContext); + allocateDeviceBuffer(&ad->eElec, 1, deviceContext); /* initialize to nullptr pointers to data that is not allocated here and will need reallocation in nbnxn_gpu_init_atomdata */ @@ -120,8 +120,8 @@ static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, const DeviceConte ad->f = nullptr; /* size -1 indicates that the respective array hasn't been initialized yet */ - ad->natoms = -1; - ad->nalloc = -1; + ad->numAtoms = -1; + ad->numAtomsAlloc = -1; } @@ -206,7 +206,7 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb) { cl_int cl_error; - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream(); size_t local_work_size[3] = { 1, 1, 1 }; @@ -223,9 +223,9 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb) global_work_size[0] = ((shifts + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0]; arg_no = 0; - cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift)); - cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj)); - cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el)); + cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fShift)); + cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), 
&(adat->eLJ)); + cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eElec)); cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts); GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str()); @@ -264,7 +264,7 @@ static void nbnxn_gpu_init_kernels(NbnxmGpu* nb) * Initializes members of the atomdata and nbparam structs and * clears e/fshift output buffers. */ -static void nbnxn_ocl_init_const(cl_atomdata_t* atomData, +static void nbnxn_ocl_init_const(NBAtomData* atomData, NBParamGpu* nbParams, const interaction_const_t* ic, const PairlistParams& listParams, @@ -304,9 +304,9 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, nb->dev_rundata = new gmx_device_runtime_data_t(); /* init nbst */ - pmalloc(reinterpret_cast(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj)); - pmalloc(reinterpret_cast(&nb->nbst.e_el), sizeof(*nb->nbst.e_el)); - pmalloc(reinterpret_cast(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift)); + pmalloc(reinterpret_cast(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ)); + pmalloc(reinterpret_cast(&nb->nbst.eElec), sizeof(*nb->nbst.eElec)); + pmalloc(reinterpret_cast(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift)); init_plist(nb->plist[InteractionLocality::Local]); @@ -372,7 +372,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear) return; } - cl_atomdata_t* atomData = nb->atdat; + NBAtomData* atomData = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; clearDeviceBufferAsync(&atomData->f, 0, natoms_clear, localStream); @@ -381,7 +381,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear) //! This function is documented in the header file void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) { - nbnxn_ocl_clear_f(nb, nb->atdat->natoms); + nbnxn_ocl_clear_f(nb, nb->atdat->numAtoms); /* clear shift force array and energies if the outputs were used in the current step */ if (computeVirial) @@ -398,22 +398,22 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) //! 
This function is documented in the header file void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; /* only if we have a dynamic box */ - if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) + if (nbatom->bDynamicBox || !adat->shiftVecUploaded) { static_assert(sizeof(Float3) == sizeof(nbatom->shift_vec[0]), "Sizes of host- and device-side shift vectors should be the same."); - copyToDeviceBuffer(&adat->shift_vec, + copyToDeviceBuffer(&adat->shiftVec, reinterpret_cast(nbatom->shift_vec.data()), 0, SHIFTS, localStream, GpuApiCallBehavior::Async, nullptr); - adat->bShiftVecUploaded = CL_TRUE; + adat->shiftVecUploaded = true; } } @@ -425,7 +425,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) bool realloced; bool bDoTime = nb->bDoTime; cl_timers_t* timers = nb->timers; - cl_atomdata_t* d_atdat = nb->atdat; + NBAtomData* d_atdat = nb->atdat; const DeviceContext& deviceContext = *nb->deviceContext_; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; @@ -440,17 +440,17 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) /* need to reallocate if we have to copy more atoms than the amount of space available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */ - if (natoms > d_atdat->nalloc) + if (natoms > d_atdat->numAtomsAlloc) { nalloc = over_alloc_small(natoms); /* free up first if the arrays have already been initialized */ - if (d_atdat->nalloc != -1) + if (d_atdat->numAtomsAlloc != -1) { freeDeviceBuffer(&d_atdat->f); freeDeviceBuffer(&d_atdat->xq); - freeDeviceBuffer(&d_atdat->lj_comb); - freeDeviceBuffer(&d_atdat->atom_types); + freeDeviceBuffer(&d_atdat->ljComb); + freeDeviceBuffer(&d_atdat->atomTypes); } @@ -460,19 +460,19 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwType)) { // Two Lennard-Jones parameters per atom - allocateDeviceBuffer(&d_atdat->lj_comb, nalloc, deviceContext); + allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext); } else { - allocateDeviceBuffer(&d_atdat->atom_types, nalloc, deviceContext); + allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext); } - d_atdat->nalloc = nalloc; - realloced = true; + d_atdat->numAtomsAlloc = nalloc; + realloced = true; } - d_atdat->natoms = natoms; - d_atdat->natoms_local = nbat->natoms_local; + d_atdat->numAtoms = natoms; + d_atdat->numAtomsLocal = nbat->natoms_local; /* need to clear GPU f output if realloc happened */ if (realloced) @@ -484,7 +484,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { static_assert(sizeof(float) == sizeof(*nbat->params().lj_comb.data()), "Size of the LJ parameters element should be equal to the size of float2."); - copyToDeviceBuffer(&d_atdat->lj_comb, + copyToDeviceBuffer(&d_atdat->ljComb, reinterpret_cast(nbat->params().lj_comb.data()), 0, natoms, @@ -496,7 +496,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { static_assert(sizeof(int) == sizeof(*nbat->params().type.data()), "Sizes of host- and device-side atom types should be the same."); - copyToDeviceBuffer(&d_atdat->atom_types, + copyToDeviceBuffer(&d_atdat->atomTypes, nbat->params().type.data(), 0, natoms, @@ -592,12 +592,12 @@ void gpu_free(NbnxmGpu* nb) /* Free atdat */ freeDeviceBuffer(&(nb->atdat->xq)); freeDeviceBuffer(&(nb->atdat->f)); - 
freeDeviceBuffer(&(nb->atdat->e_lj)); - freeDeviceBuffer(&(nb->atdat->e_el)); - freeDeviceBuffer(&(nb->atdat->fshift)); - freeDeviceBuffer(&(nb->atdat->lj_comb)); - freeDeviceBuffer(&(nb->atdat->atom_types)); - freeDeviceBuffer(&(nb->atdat->shift_vec)); + freeDeviceBuffer(&(nb->atdat->eLJ)); + freeDeviceBuffer(&(nb->atdat->eElec)); + freeDeviceBuffer(&(nb->atdat->fShift)); + freeDeviceBuffer(&(nb->atdat->ljComb)); + freeDeviceBuffer(&(nb->atdat->atomTypes)); + freeDeviceBuffer(&(nb->atdat->shiftVec)); sfree(nb->atdat); /* Free nbparam */ @@ -624,14 +624,14 @@ void gpu_free(NbnxmGpu* nb) } /* Free nbst */ - pfree(nb->nbst.e_lj); - nb->nbst.e_lj = nullptr; + pfree(nb->nbst.eLJ); + nb->nbst.eLJ = nullptr; - pfree(nb->nbst.e_el); - nb->nbst.e_el = nullptr; + pfree(nb->nbst.eElec); + nb->nbst.eElec = nullptr; - pfree(nb->nbst.fshift); - nb->nbst.fshift = nullptr; + pfree(nb->nbst.fShift); + nb->nbst.fShift = nullptr; freeGpuProgram(nb->dev_rundata->program); delete nb->dev_rundata; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index 751e352962..474d90700d 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -74,62 +74,6 @@ enum ePruneKind ePruneNR }; -/*! \internal - * \brief Staging area for temporary data downloaded from the GPU. - * - * The energies/shift forces get downloaded here first, before getting added - * to the CPU-side aggregate values. - */ -struct nb_staging_t -{ - //! LJ energy - float* e_lj = nullptr; - //! electrostatic energy - float* e_el = nullptr; - //! float3 buffer with shift forces - Float3* fshift = nullptr; -}; - -/*! \internal - * \brief Nonbonded atom data - both inputs and outputs. - */ -typedef struct cl_atomdata -{ - //! number of atoms - int natoms; - //! number of local atoms - int natoms_local; - //! allocation size for the atom data (xq, f) - int nalloc; - - //! float4 buffer with atom coordinates + charges, size natoms - DeviceBuffer xq; - - //! float3 buffer with force output array, size natoms - DeviceBuffer f; - - //! LJ energy output, size 1 - DeviceBuffer e_lj; - //! Electrostatics energy input, size 1 - DeviceBuffer e_el; - - //! float3 buffer with shift forces - DeviceBuffer fshift; - - //! number of atom types - int ntypes; - //! int buffer with atom type indices, size natoms - DeviceBuffer atom_types; - //! float2 buffer with sqrt(c6),sqrt(c12), size natoms - DeviceBuffer lj_comb; - - //! float3 buffer with shifts values - DeviceBuffer shift_vec; - - //! true if the shift vector has been uploaded - bool bShiftVecUploaded; -} cl_atomdata_t; - /*! \internal * \brief Data structure shared between the OpenCL device code and OpenCL host code * @@ -229,13 +173,13 @@ struct NbnxmGpu bool bNonLocalStreamDoneMarked = false; //! atom data - cl_atomdata_t* atdat = nullptr; + NBAtomData* atdat = nullptr; //! parameters required for the non-bonded calc. NBParamGpu* nbparam = nullptr; //! pair-list data structures (local and non-local) gmx::EnumerationArray plist = { nullptr }; //! staging area where fshift/energies get downloaded - nb_staging_t nbst; + NBStagingData nbst; //! 
local and non-local GPU queues gmx::EnumerationArray deviceStreams; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp index c313deb110..60ba8b32f4 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp @@ -87,7 +87,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, "beginning of the copy back function."); const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; /* don't launch non-local copy-back if there was no non-local work to do */ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) @@ -138,24 +138,24 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH fshift when virial is needed */ if (stepWork.computeVirial) { - GMX_ASSERT(sizeof(*nb->nbst.fshift) == adat->fShift.elementSize(), + GMX_ASSERT(sizeof(*nb->nbst.fShift) == adat->fShift.elementSize(), "Sizes of host- and device-side shift vector elements should be the same."); copyFromDeviceBuffer( - nb->nbst.fshift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr); + nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr); } /* DtoH energies */ if (stepWork.computeEnergy) { - GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float), + GMX_ASSERT(sizeof(*nb->nbst.eLJ) == sizeof(float), "Sizes of host- and device-side LJ energy terms should be the same."); copyFromDeviceBuffer( - nb->nbst.e_lj, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); - GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float), + nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); + GMX_ASSERT(sizeof(*nb->nbst.eElec) == sizeof(float), "Sizes of host- and device-side electrostatic energy terms should be the " "same."); copyFromDeviceBuffer( - nb->nbst.e_el, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); + nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); } } } @@ -168,7 +168,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp index e033a507e8..0f67f04f0a 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp @@ -59,10 +59,10 @@ namespace Nbnxm //! 
This function is documented in the header file void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) { - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; // Clear forces - clearDeviceBufferAsync(&adat->f, 0, nb->atdat->natoms, localStream); + clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream); // Clear shift force array and energies if the outputs were used in the current step if (computeVirial) { @@ -76,7 +76,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext) { const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - sycl_atomdata_t* atomdata = nb->atdat; + NBAtomData* atomdata = nb->atdat; atomdata->numTypes = numTypes; allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext); atomdata->shiftVecUploaded = false; @@ -95,8 +95,8 @@ static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& d atomdata->f = nullptr; /* size -1 indicates that the respective array hasn't been initialized yet */ - atomdata->natoms = -1; - atomdata->numAlloc = -1; + atomdata->numAtoms = -1; + atomdata->numAtomsAlloc = -1; } /*! \brief Initialize the nonbonded parameter data structure. */ @@ -144,7 +144,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, { auto* nb = new NbnxmGpu(); nb->deviceContext_ = &deviceStreamManager.context(); - nb->atdat = new sycl_atomdata_t; + nb->atdat = new NBAtomData; nb->nbparam = new NBParamGpu; nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist; if (bLocalAndNonlocal) @@ -158,9 +158,9 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, nb->timings = nullptr; /* init nbst */ - pmalloc(reinterpret_cast(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj)); - pmalloc(reinterpret_cast(&nb->nbst.e_el), sizeof(*nb->nbst.e_el)); - pmalloc(reinterpret_cast(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift)); + pmalloc(reinterpret_cast(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ)); + pmalloc(reinterpret_cast(&nb->nbst.eElec), sizeof(*nb->nbst.eElec)); + pmalloc(reinterpret_cast(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift)); init_plist(nb->plist[InteractionLocality::Local]); @@ -195,7 +195,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; /* only if we have a dynamic box */ @@ -217,18 +217,18 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet"); - sycl_atomdata_t* atdat = nb->atdat; + NBAtomData* atdat = nb->atdat; const DeviceContext& deviceContext = *nb->deviceContext_; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; int numAtoms = nbat->numAtoms(); bool reallocated = false; - if (numAtoms > atdat->numAlloc) + if (numAtoms > atdat->numAtomsAlloc) { int numAlloc = over_alloc_small(numAtoms); /* free up first if the arrays have already been initialized */ - if (atdat->numAlloc != -1) + if (atdat->numAtomsAlloc != -1) { freeDeviceBuffer(&atdat->f); freeDeviceBuffer(&atdat->xq); @@ -247,17 +247,17 @@ void gpu_init_atomdata(NbnxmGpu* nb, 
const nbnxn_atomdata_t* nbat) allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext); } - atdat->numAlloc = numAlloc; - reallocated = true; + atdat->numAtomsAlloc = numAlloc; + reallocated = true; } - atdat->natoms = numAtoms; - atdat->natoms_local = nbat->natoms_local; + atdat->numAtoms = numAtoms; + atdat->numAtomsLocal = nbat->natoms_local; /* need to clear GPU f output if realloc happened */ if (reallocated) { - clearDeviceBufferAsync(&atdat->f, 0, atdat->numAlloc, localStream); + clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream); } if (useLjCombRule(nb->nbparam->vdwType)) @@ -293,8 +293,8 @@ void gpu_free(NbnxmGpu* nb) return; } - sycl_atomdata_t* atdat = nb->atdat; - NBParamGpu* nbparam = nb->nbparam; + NBAtomData* atdat = nb->atdat; + NBParamGpu* nbparam = nb->nbparam; if ((!nbparam->coulomb_tab) && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)) @@ -322,14 +322,14 @@ void gpu_free(NbnxmGpu* nb) } /* Free nbst */ - pfree(nb->nbst.e_lj); - nb->nbst.e_lj = nullptr; + pfree(nb->nbst.eLJ); + nb->nbst.eLJ = nullptr; - pfree(nb->nbst.e_el); - nb->nbst.e_el = nullptr; + pfree(nb->nbst.eElec); + nb->nbst.eElec = nullptr; - pfree(nb->nbst.fshift); - nb->nbst.fshift = nullptr; + pfree(nb->nbst.fShift); + nb->nbst.fShift = nullptr; delete atdat; delete nbparam; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index 946eb2dd4d..770732c70c 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -962,7 +962,7 @@ cl::sycl::event chooseAndLaunchNbnxmKernel(bool doPruneNBL, void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) { - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; const bool doPruneNBL = (plist->haveFreshList && !nb->didPrune[iloc]); diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp index cf9ce2f67d..62f4bb7592 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp @@ -264,7 +264,7 @@ void launchNbnxmKernelPruneOnly(NbnxmGpu* nb, const int part, const int numSciInPart) { - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; const bool haveFreshList = plist->haveFreshList; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h index ba14d9b867..c1e23c1a74 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h @@ -55,60 +55,6 @@ #include "gromacs/timing/gpu_timing.h" #include "gromacs/utility/enumerationhelpers.h" -/*! \internal - * \brief Staging area for temporary data downloaded from the GPU. - * - * Since SYCL buffers already have host-side storage, this is a bit redundant. - * But it allows prefetching of the data from GPU, and brings GPU backends closer together. - */ -struct nb_staging_t -{ - //! LJ energy - float* e_lj = nullptr; - //! electrostatic energy - float* e_el = nullptr; - //! shift forces - Float3* fshift = nullptr; -}; - -/** \internal - * \brief Nonbonded atom data - both inputs and outputs. - */ -struct sycl_atomdata_t -{ - //! number of atoms - int natoms; - //! number of local atoms - int natoms_local; // - //! 
allocation size for the atom data (xq, f) - int numAlloc; - - //! atom coordinates + charges, size \ref natoms - DeviceBuffer xq; - //! force output array, size \ref natoms - DeviceBuffer f; - - //! LJ energy output, size 1 - DeviceBuffer eLJ; - //! Electrostatics energy input, size 1 - DeviceBuffer eElec; - - //! shift forces - DeviceBuffer fShift; - - //! number of atom types - int numTypes; - //! atom type indices, size \ref natoms - DeviceBuffer atomTypes; - //! sqrt(c6),sqrt(c12) size \ref natoms - DeviceBuffer ljComb; - - //! shifts - DeviceBuffer shiftVec; - //! true if the shift vector has been uploaded - bool shiftVecUploaded; -}; - class GpuEventSynchronizer; /*! \internal @@ -126,13 +72,13 @@ struct NbnxmGpu /*! \brief true indicates that the nonlocal_done event was marked */ bool bNonLocalStreamDoneMarked = false; /*! \brief atom data */ - sycl_atomdata_t* atdat = nullptr; + NBAtomData* atdat = nullptr; NBParamGpu* nbparam = nullptr; /*! \brief pair-list data structures (local and non-local) */ gmx::EnumerationArray plist = { { nullptr } }; /*! \brief staging area where fshift/energies get downloaded. Will be removed in SYCL. */ - nb_staging_t nbst; + NBStagingData nbst; /*! \brief local and non-local GPU streams */ gmx::EnumerationArray deviceStreams; -- 2.22.0
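
Note (illustration, not part of the patch): all three backends now share the
same atom-range convention through NBAtomData, which is what lets the common
code in gpu_common.h (getGpuAtomRange(), gpu_reduce_staged_outputs()) work
unchanged for CUDA, OpenCL and SYCL. The sketch below restates that convention
in plain, self-contained C++; AtomDataCounts and getAtomRange are hypothetical
stand-ins for exposition (the real struct also carries the DeviceBuffer
members, which are omitted here).

    // Local atoms occupy indices [0, numAtomsLocal); non-local atoms occupy
    // [numAtomsLocal, numAtoms), mirroring the getGpuAtomRange() hunk above.
    struct AtomDataCounts // hypothetical stand-in for the counters in NBAtomData
    {
        int numAtoms      = -1; // -1 marks "not initialized yet", as in the patch
        int numAtomsLocal = -1;
    };

    enum class AtomLocality { Local, NonLocal };

    static void getAtomRange(const AtomDataCounts& atomData,
                             AtomLocality          atomLocality,
                             int*                  atomRangeBegin,
                             int*                  atomRangeLen)
    {
        if (atomLocality == AtomLocality::Local)
        {
            *atomRangeBegin = 0;
            *atomRangeLen   = atomData.numAtomsLocal;
        }
        else
        {
            *atomRangeBegin = atomData.numAtomsLocal;
            *atomRangeLen   = atomData.numAtoms - atomData.numAtomsLocal;
        }
    }

The same arithmetic produces adat_begin/adat_len in gpu_copy_xq_to_gpu() in all
three backends, so the per-backend cu_atomdata_t/cl_atomdata_t/sycl_atomdata_t
structs carried no backend-specific state and could be folded into one type.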