From cf8c4835899db2eab12091c5054a9c5b6c7081e4 Mon Sep 17 00:00:00 2001 From: Artem Zhmurov Date: Mon, 22 Feb 2021 14:15:16 +0000 Subject: [PATCH] Unify NB atoms and staging data structures in OpenCL, CUDA and SYCL Refs #2608 --- src/gromacs/nbnxm/cuda/nbnxm_cuda.cu | 30 ++--- .../nbnxm/cuda/nbnxm_cuda_data_mgmt.cu | 109 +++++++++--------- src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh | 18 +-- .../nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu | 6 +- .../cuda/nbnxm_cuda_kernel_pruneonly.cuh | 8 +- src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h | 65 +---------- src/gromacs/nbnxm/gpu_common.h | 15 ++- src/gromacs/nbnxm/gpu_types_common.h | 54 +++++++++ src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp | 56 ++++----- .../nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp | 96 +++++++-------- src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h | 60 +--------- src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp | 16 +-- .../nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp | 52 ++++----- src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp | 2 +- .../sycl/nbnxm_sycl_kernel_pruneonly.cpp | 2 +- src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h | 58 +--------- 16 files changed, 263 insertions(+), 384 deletions(-) diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu index 62d50f039d..594c4ca291 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda.cu @@ -122,7 +122,7 @@ namespace Nbnxm constexpr static int c_bufOpsThreadsPerBlock = 128; /*! Nonbonded kernel function pointer type */ -typedef void (*nbnxn_cu_kfunc_ptr_t)(const cu_atomdata_t, const NBParamGpu, const gpu_plist, bool); +typedef void (*nbnxn_cu_kfunc_ptr_t)(const NBAtomData, const NBParamGpu, const gpu_plist, bool); /*********************************/ @@ -456,7 +456,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom int adat_begin, adat_len; /* local/nonlocal offset and length used for xq and f */ - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -488,12 +488,12 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom if (atomLocality == AtomLocality::Local) { adat_begin = 0; - adat_len = adat->natoms_local; + adat_len = adat->numAtomsLocal; } else { - adat_begin = adat->natoms_local; - adat_len = adat->natoms - adat->natoms_local; + adat_begin = adat->numAtomsLocal; + adat_len = adat->numAtoms - adat->numAtomsLocal; } /* beginning of timed HtoD section */ @@ -546,7 +546,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom */ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; @@ -667,7 +667,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; cu_timers_t* t = nb->timers; @@ -811,7 +811,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, "beginning of the copy back function."); /* extract the data */ - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; cu_timers_t* t = nb->timers; bool bDoTime = nb->bDoTime; const DeviceStream& deviceStream = 
*nb->deviceStreams[iloc];
@@ -872,24 +872,24 @@ void gpu_launch_cpyback(NbnxmGpu* nb,
     /* DtoH fshift when virial is needed */
     if (stepWork.computeVirial)
     {
-        static_assert(sizeof(nb->nbst.fshift[0]) == sizeof(adat->fshift[0]),
+        static_assert(sizeof(nb->nbst.fShift[0]) == sizeof(adat->fShift[0]),
                       "Sizes of host- and device-side shift vectors should be the same.");
         copyFromDeviceBuffer(
-                nb->nbst.fshift, &adat->fshift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
+                nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr);
     }

     /* DtoH energies */
     if (stepWork.computeEnergy)
     {
-        static_assert(sizeof(nb->nbst.e_lj[0]) == sizeof(adat->e_lj[0]),
+        static_assert(sizeof(nb->nbst.eLJ[0]) == sizeof(adat->eLJ[0]),
                       "Sizes of host- and device-side LJ energy terms should be the same.");
         copyFromDeviceBuffer(
-                nb->nbst.e_lj, &adat->e_lj, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
-        static_assert(sizeof(nb->nbst.e_el[0]) == sizeof(adat->e_el[0]),
+                nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+        static_assert(sizeof(nb->nbst.eElec[0]) == sizeof(adat->eElec[0]),
                       "Sizes of host- and device-side electrostatic energy terms should be the "
                       "same.");
         copyFromDeviceBuffer(
-                nb->nbst.e_el, &adat->e_el, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
+                nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr);
     }
 }

@@ -929,7 +929,7 @@ void nbnxn_gpu_x_to_nbat_x(const Nbnxm::Grid& grid,
 {
     GMX_ASSERT(nb, "Need a valid nbnxn_gpu object");

-    cu_atomdata_t* adat = nb->atdat;
+    NBAtomData* adat = nb->atdat;

     const int numColumns = grid.numColumns();
     const int cellOffset = grid.cellOffset();
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
index db8654440e..2505422927 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_data_mgmt.cu
@@ -97,15 +97,15 @@ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb);

 /*! Initializes the atomdata structure first time, it only gets filled at pair-search. */
-static void init_atomdata_first(cu_atomdata_t* ad, int ntypes, const DeviceContext& deviceContext)
+static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext)
 {
-    ad->ntypes = ntypes;
-    allocateDeviceBuffer(&ad->shift_vec, SHIFTS, deviceContext);
-    ad->bShiftVecUploaded = false;
+    ad->numTypes = ntypes;
+    allocateDeviceBuffer(&ad->shiftVec, SHIFTS, deviceContext);
+    ad->shiftVecUploaded = false;

-    allocateDeviceBuffer(&ad->fshift, SHIFTS, deviceContext);
-    allocateDeviceBuffer(&ad->e_lj, 1, deviceContext);
-    allocateDeviceBuffer(&ad->e_el, 1, deviceContext);
+    allocateDeviceBuffer(&ad->fShift, SHIFTS, deviceContext);
+    allocateDeviceBuffer(&ad->eLJ, 1, deviceContext);
+    allocateDeviceBuffer(&ad->eElec, 1, deviceContext);

     /* initialize to nullptr pointers to data that is not allocated here and will need
        reallocation in nbnxn_cuda_init_atomdata */
     ad->xq = nullptr;
     ad->f  = nullptr;

     /* size -1 indicates that the respective array hasn't been initialized yet */
-    ad->natoms = -1;
-    ad->nalloc = -1;
+    ad->numAtoms      = -1;
+    ad->numAtomsAlloc = -1;
 }

 /*! Initializes the nonbonded parameter data structure.
*/ @@ -198,9 +198,9 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, snew(nb->timings, 1); /* init nbst */ - pmalloc((void**)&nb->nbst.e_lj, sizeof(*nb->nbst.e_lj)); - pmalloc((void**)&nb->nbst.e_el, sizeof(*nb->nbst.e_el)); - pmalloc((void**)&nb->nbst.fshift, SHIFTS * sizeof(*nb->nbst.fshift)); + pmalloc((void**)&nb->nbst.eLJ, sizeof(*nb->nbst.eLJ)); + pmalloc((void**)&nb->nbst.eElec, sizeof(*nb->nbst.eElec)); + pmalloc((void**)&nb->nbst.fShift, SHIFTS * sizeof(*nb->nbst.fShift)); init_plist(nb->plist[InteractionLocality::Local]); @@ -259,29 +259,29 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; /* only if we have a dynamic box */ - if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) + if (nbatom->bDynamicBox || !adat->shiftVecUploaded) { - static_assert(sizeof(adat->shift_vec[0]) == sizeof(nbatom->shift_vec[0]), + static_assert(sizeof(adat->shiftVec[0]) == sizeof(nbatom->shift_vec[0]), "Sizes of host- and device-side shift vectors should be the same."); - copyToDeviceBuffer(&adat->shift_vec, + copyToDeviceBuffer(&adat->shiftVec, reinterpret_cast(nbatom->shift_vec.data()), 0, SHIFTS, localStream, GpuApiCallBehavior::Async, nullptr); - adat->bShiftVecUploaded = true; + adat->shiftVecUploaded = true; } } /*! Clears the first natoms_clear elements of the GPU nonbonded force output array. */ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; clearDeviceBufferAsync(&adat->f, 0, natoms_clear, localStream); } @@ -289,17 +289,17 @@ static void nbnxn_cuda_clear_f(NbnxmGpu* nb, int natoms_clear) /*! Clears nonbonded shift force output array and energy outputs on the GPU. 
*/ static void nbnxn_cuda_clear_e_fshift(NbnxmGpu* nb) { - cu_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - clearDeviceBufferAsync(&adat->fshift, 0, SHIFTS, localStream); - clearDeviceBufferAsync(&adat->e_lj, 0, 1, localStream); - clearDeviceBufferAsync(&adat->e_el, 0, 1, localStream); + clearDeviceBufferAsync(&adat->fShift, 0, SHIFTS, localStream); + clearDeviceBufferAsync(&adat->eLJ, 0, 1, localStream); + clearDeviceBufferAsync(&adat->eElec, 0, 1, localStream); } void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) { - nbnxn_cuda_clear_f(nb, nb->atdat->natoms); + nbnxn_cuda_clear_f(nb, nb->atdat->numAtoms); /* clear shift force array and energies if the outputs were used in the current step */ if (computeVirial) @@ -314,7 +314,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) bool realloced; bool bDoTime = nb->bDoTime; cu_timers_t* timers = nb->timers; - cu_atomdata_t* d_atdat = nb->atdat; + NBAtomData* d_atdat = nb->atdat; const DeviceContext& deviceContext = *nb->deviceContext_; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; @@ -329,36 +329,36 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) /* need to reallocate if we have to copy more atoms than the amount of space available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */ - if (natoms > d_atdat->nalloc) + if (natoms > d_atdat->numAtomsAlloc) { nalloc = over_alloc_small(natoms); /* free up first if the arrays have already been initialized */ - if (d_atdat->nalloc != -1) + if (d_atdat->numAtomsAlloc != -1) { freeDeviceBuffer(&d_atdat->f); freeDeviceBuffer(&d_atdat->xq); - freeDeviceBuffer(&d_atdat->atom_types); - freeDeviceBuffer(&d_atdat->lj_comb); + freeDeviceBuffer(&d_atdat->atomTypes); + freeDeviceBuffer(&d_atdat->ljComb); } allocateDeviceBuffer(&d_atdat->f, nalloc, deviceContext); allocateDeviceBuffer(&d_atdat->xq, nalloc, deviceContext); if (useLjCombRule(nb->nbparam->vdwType)) { - allocateDeviceBuffer(&d_atdat->lj_comb, nalloc, deviceContext); + allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext); } else { - allocateDeviceBuffer(&d_atdat->atom_types, nalloc, deviceContext); + allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext); } - d_atdat->nalloc = nalloc; - realloced = true; + d_atdat->numAtomsAlloc = nalloc; + realloced = true; } - d_atdat->natoms = natoms; - d_atdat->natoms_local = nbat->natoms_local; + d_atdat->numAtoms = natoms; + d_atdat->numAtomsLocal = nbat->natoms_local; /* need to clear GPU f output if realloc happened */ if (realloced) @@ -368,9 +368,9 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwType)) { - static_assert(sizeof(d_atdat->lj_comb[0]) == sizeof(Float2), + static_assert(sizeof(d_atdat->ljComb[0]) == sizeof(Float2), "Size of the LJ parameters element should be equal to the size of float2."); - copyToDeviceBuffer(&d_atdat->lj_comb, + copyToDeviceBuffer(&d_atdat->ljComb, reinterpret_cast(nbat->params().lj_comb.data()), 0, natoms, @@ -380,9 +380,9 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) } else { - static_assert(sizeof(d_atdat->atom_types[0]) == sizeof(nbat->params().type[0]), + static_assert(sizeof(d_atdat->atomTypes[0]) == sizeof(nbat->params().type[0]), "Sizes of host- and device-side atom types should be the same."); - copyToDeviceBuffer(&d_atdat->atom_types, + 
copyToDeviceBuffer(&d_atdat->atomTypes,
                           nbat->params().type.data(),
                           0,
                           natoms,
@@ -399,16 +399,13 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat)

 void gpu_free(NbnxmGpu* nb)
 {
-    cu_atomdata_t* atdat;
-    NBParamGpu*    nbparam;
-
     if (nb == nullptr)
     {
         return;
     }

-    atdat   = nb->atdat;
-    nbparam = nb->nbparam;
+    NBAtomData* atdat   = nb->atdat;
+    NBParamGpu* nbparam = nb->nbparam;

     if ((!nbparam->coulomb_tab)
         && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin))
@@ -428,16 +425,16 @@ void gpu_free(NbnxmGpu* nb)
         destroyParamLookupTable(&nbparam->nbfp_comb, nbparam->nbfp_comb_texobj);
     }

-    freeDeviceBuffer(&atdat->shift_vec);
-    freeDeviceBuffer(&atdat->fshift);
+    freeDeviceBuffer(&atdat->shiftVec);
+    freeDeviceBuffer(&atdat->fShift);

-    freeDeviceBuffer(&atdat->e_lj);
-    freeDeviceBuffer(&atdat->e_el);
+    freeDeviceBuffer(&atdat->eLJ);
+    freeDeviceBuffer(&atdat->eElec);

     freeDeviceBuffer(&atdat->f);
     freeDeviceBuffer(&atdat->xq);
-    freeDeviceBuffer(&atdat->atom_types);
-    freeDeviceBuffer(&atdat->lj_comb);
+    freeDeviceBuffer(&atdat->atomTypes);
+    freeDeviceBuffer(&atdat->ljComb);

     /* Free plist */
     auto* plist = nb->plist[InteractionLocality::Local];
@@ -457,14 +454,14 @@ void gpu_free(NbnxmGpu* nb)
     }

     /* Free nbst */
-    pfree(nb->nbst.e_lj);
-    nb->nbst.e_lj = nullptr;
+    pfree(nb->nbst.eLJ);
+    nb->nbst.eLJ = nullptr;

-    pfree(nb->nbst.e_el);
-    nb->nbst.e_el = nullptr;
+    pfree(nb->nbst.eElec);
+    nb->nbst.eElec = nullptr;

-    pfree(nb->nbst.fshift);
-    nb->nbst.fshift = nullptr;
+    pfree(nb->nbst.fShift);
+    nb->nbst.fShift = nullptr;

     sfree(atdat);
     sfree(nbparam);
@@ -501,7 +498,7 @@ DeviceBuffer<gmx::RVec> gpu_get_fshift(NbnxmGpu* nb)
 {
     assert(nb);

-    return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->fshift);
+    return reinterpret_cast<DeviceBuffer<gmx::RVec>>(nb->atdat->fShift);
 }

 /* Initialization for X buffer operations on GPU. */
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
index 688e094715..344e971c84 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel.cuh
@@ -159,7 +159,7 @@ __launch_bounds__(THREADS_PER_BLOCK) __global__ void NB_KERNEL_FUNC_NAME(nbnxn_kernel, _F_cuda)
 # endif /* CALC_ENERGIES */
 #endif     /* PRUNE_NBL */
-        (const cu_atomdata_t atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
+        (const NBAtomData atdat, const NBParamGpu nbparam, const Nbnxm::gpu_plist plist, bool bCalcFshift)
 #ifdef FUNCTION_DECLARATION_ONLY
         ; /* Only do function declaration, omit the function body.
*/
 #else
@@ -172,15 +172,15 @@ __launch_bounds__(THREADS_PER_BLOCK)
     nbnxn_cj4_t*        pl_cj4 = plist.cj4;
     const nbnxn_excl_t* excl   = plist.excl;
# ifndef LJ_COMB
-    const int* atom_types = atdat.atom_types;
-    int        ntypes     = atdat.ntypes;
+    const int* atom_types = atdat.atomTypes;
+    int        ntypes     = atdat.numTypes;
# else
-    const float2* lj_comb = atdat.lj_comb;
+    const float2* lj_comb = atdat.ljComb;
     float2        ljcp_i, ljcp_j;
# endif
     const float4* xq = atdat.xq;
     float3*       f  = asFloat3(atdat.f);
-    const float3* shift_vec = asFloat3(atdat.shift_vec);
+    const float3* shift_vec = asFloat3(atdat.shiftVec);
     float         rcoulomb_sq = nbparam.rcoulomb_sq;
# ifdef VDW_CUTOFF_CHECK
     float         rvdw_sq = nbparam.rvdw_sq;
@@ -207,8 +207,8 @@ __launch_bounds__(THREADS_PER_BLOCK)
# else
     float reactionFieldShift = nbparam.c_rf;
# endif /* EL_EWALD_ANY */
-    float* e_lj = atdat.e_lj;
-    float* e_el = atdat.e_el;
+    float* e_lj = atdat.eLJ;
+    float* e_el = atdat.eElec;
# endif /* CALC_ENERGIES */

     /* thread/block/warp id-s */
@@ -649,8 +649,8 @@ __launch_bounds__(THREADS_PER_BLOCK)
     /* add up local shift forces into global mem, tidxj indexes x,y,z */
     if (bCalcFshift && (tidxj & 3) < 3)
     {
-        float3* fshift = asFloat3(atdat.fshift);
-        atomicAdd(&(fshift[nb_sci.shift].x) + (tidxj & 3), fshift_buf);
+        float3* fShift = asFloat3(atdat.fShift);
+        atomicAdd(&(fShift[nb_sci.shift].x) + (tidxj & 3), fshift_buf);
     }

# ifdef CALC_ENERGIES
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu
index fb8ebb2e76..2ff980a6b0 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cu
@@ -1,7 +1,7 @@
 /*
  * This file is part of the GROMACS molecular simulation package.
  *
- * Copyright (c) 2016,2017,2019,2020, by the GROMACS development team, led by
+ * Copyright (c) 2016,2017,2019,2020,2021, by the GROMACS development team, led by
  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
  * and including many others, as listed in the AUTHORS file in the
  * top-level source directory and at http://www.gromacs.org.
@@ -39,7 +39,7 @@
 #ifndef FUNCTION_DECLARATION_ONLY
 /* Instantiate external template functions */
 template __global__ void
-nbnxn_kernel_prune_cuda<true>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
+nbnxn_kernel_prune_cuda<true>(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
 template __global__ void
-nbnxn_kernel_prune_cuda<false>(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
+nbnxn_kernel_prune_cuda<false>(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int);
 #endif
diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
index 11c51227f4..c5c55e667f 100644
--- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
+++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_kernel_pruneonly.cuh
@@ -104,7 +104,7 @@
  */
 template<bool haveFreshList>
 __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
-        void nbnxn_kernel_prune_cuda(const cu_atomdata_t    atdat,
+        void nbnxn_kernel_prune_cuda(const NBAtomData       atdat,
                                      const NBParamGpu       nbparam,
                                      const Nbnxm::gpu_plist plist,
                                      int                    numParts,
@@ -115,9 +115,9 @@ __launch_bounds__(THREADS_PER_BLOCK, MIN_BLOCKS_PER_MP) __global__
 // Add extern declarations so each translation unit understands that
 // there will be a definition provided.
extern template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int); +nbnxn_kernel_prune_cuda(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int); extern template __global__ void -nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnxm::gpu_plist, int, int); +nbnxn_kernel_prune_cuda(const NBAtomData, const NBParamGpu, const Nbnxm::gpu_plist, int, int); #else { @@ -125,7 +125,7 @@ nbnxn_kernel_prune_cuda(const cu_atomdata_t, const NBParamGpu, const Nbnx const nbnxn_sci_t* pl_sci = plist.sci; nbnxn_cj4_t* pl_cj4 = plist.cj4; const float4* xq = atdat.xq; - const float3* shift_vec = asFloat3(atdat.shift_vec); + const float3* shift_vec = asFloat3(atdat.shiftVec); float rlistOuter_sq = nbparam.rlistOuter_sq; float rlistInner_sq = nbparam.rlistInner_sq; diff --git a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h index f1b1a6db81..08d96de90f 100644 --- a/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h +++ b/src/gromacs/nbnxm/cuda/nbnxm_cuda_types.h @@ -64,67 +64,6 @@ /*! \brief cluster size = number of atoms per cluster. */ static constexpr int c_clSize = c_nbnxnGpuClusterSize; -/* All structs prefixed with "cu_" hold data used in GPU calculations and - * are passed to the kernels, except cu_timers_t. */ -/*! \cond */ -typedef struct cu_atomdata cu_atomdata_t; -/*! \endcond */ - - -/** \internal - * \brief Staging area for temporary data downloaded from the GPU. - * - * The energies/shift forces get downloaded here first, before getting added - * to the CPU-side aggregate values. - */ -struct nb_staging_t -{ - //! LJ energy - float* e_lj = nullptr; - //! electrostatic energy - float* e_el = nullptr; - //! shift forces - Float3* fshift = nullptr; -}; - -/** \internal - * \brief Nonbonded atom data - both inputs and outputs. - */ -struct cu_atomdata -{ - //! number of atoms - int natoms; - //! number of local atoms - int natoms_local; - //! allocation size for the atom data (xq, f) - int nalloc; - - //! atom coordinates + charges, size natoms - DeviceBuffer xq; - //! force output array, size natoms - DeviceBuffer f; - - //! LJ energy output, size 1 - DeviceBuffer e_lj; - //! Electrostatics energy input, size 1 - DeviceBuffer e_el; - - //! shift forces - DeviceBuffer fshift; - - //! number of atom types - int ntypes; - //! atom type indices, size natoms - DeviceBuffer atom_types; - //! sqrt(c6),sqrt(c12) size natoms - DeviceBuffer lj_comb; - - //! shifts - DeviceBuffer shift_vec; - //! true if the shift vector has been uploaded - bool bShiftVecUploaded; -}; - /** \internal * \brief Typedef of actual timer type. */ @@ -146,7 +85,7 @@ struct NbnxmGpu bool bNonLocalStreamDoneMarked = false; /*! \brief atom data */ - cu_atomdata_t* atdat = nullptr; + NBAtomData* atdat = nullptr; /*! \brief array of atom indices */ int* atomIndices = nullptr; /*! \brief size of atom indices */ @@ -170,7 +109,7 @@ struct NbnxmGpu /*! \brief pair-list data structures (local and non-local) */ gmx::EnumerationArray plist = { { nullptr } }; /*! \brief staging area where fshift/energies get downloaded */ - nb_staging_t nbst; + NBStagingData nbst; /*! 
\brief local and non-local GPU streams */
    gmx::EnumerationArray<Nbnxm::InteractionLocality, const DeviceStream*> deviceStreams;
diff --git a/src/gromacs/nbnxm/gpu_common.h b/src/gromacs/nbnxm/gpu_common.h
index bb1eb874ae..dccfe1eed0 100644
--- a/src/gromacs/nbnxm/gpu_common.h
+++ b/src/gromacs/nbnxm/gpu_common.h
@@ -183,12 +183,12 @@ static inline void getGpuAtomRange(const AtomDataT* atomData,
     if (atomLocality == AtomLocality::Local)
     {
         *atomRangeBegin = 0;
-        *atomRangeLen   = atomData->natoms_local;
+        *atomRangeLen   = atomData->numAtomsLocal;
     }
     else
     {
-        *atomRangeBegin = atomData->natoms_local;
-        *atomRangeLen   = atomData->natoms - atomData->natoms_local;
+        *atomRangeBegin = atomData->numAtomsLocal;
+        *atomRangeLen   = atomData->numAtoms - atomData->numAtomsLocal;
     }
 }
@@ -241,7 +241,6 @@ static void countPruneKernelTime(GpuTimers* timers,
  * Note that this function should always be called after the transfers into the
  * staging buffers have completed.
  *
- * \tparam     StagingData    Type of staging data
  * \param[in]  nbst           Nonbonded staging data
  * \param[in]  iLocality      Interaction locality specifier
  * \param[in]  reduceEnergies True if energy reduction should be done
  * \param[in]  reduceFshift   True if shift force reduction should be done
  * \param[out] e_lj           Variable to accumulate LJ energy into
  * \param[out] e_el           Variable to accumulate electrostatic energy into
  * \param[out] fshift         Pointer to the array of shift forces to accumulate into
  */
-static inline void gpu_reduce_staged_outputs(const nb_staging_t&       nbst,
+static inline void gpu_reduce_staged_outputs(const NBStagingData&      nbst,
                                              const InteractionLocality iLocality,
                                              const bool                reduceEnergies,
                                              const bool                reduceFshift,
@@ -263,15 +262,15 @@
     {
         if (reduceEnergies)
         {
-            *e_lj += *nbst.e_lj;
-            *e_el += *nbst.e_el;
+            *e_lj += *nbst.eLJ;
+            *e_el += *nbst.eElec;
         }

         if (reduceFshift)
         {
             for (int i = 0; i < SHIFTS; i++)
             {
-                rvec_inc(fshift[i], nbst.fshift[i]);
+                rvec_inc(fshift[i], nbst.fShift[i]);
             }
         }
     }
diff --git a/src/gromacs/nbnxm/gpu_types_common.h b/src/gromacs/nbnxm/gpu_types_common.h
index ebd5db9b34..85c5853ebd 100644
--- a/src/gromacs/nbnxm/gpu_types_common.h
+++ b/src/gromacs/nbnxm/gpu_types_common.h
@@ -73,6 +73,60 @@
 //! Default for the prune kernel's j4 processing concurrency.
 static constexpr int c_pruneKernelJ4Concurrency = GMX_NBNXN_PRUNE_KERNEL_J4_CONCURRENCY;

+/*! \internal
+ * \brief Staging area for temporary data downloaded from the GPU.
+ *
+ * Since SYCL buffers already have host-side storage, this is a bit redundant.
+ * But it allows prefetching of the data from GPU, and brings GPU backends closer together.
+ */
+struct NBStagingData
+{
+    //! LJ energy
+    float* eLJ = nullptr;
+    //! electrostatic energy
+    float* eElec = nullptr;
+    //! shift forces
+    Float3* fShift = nullptr;
+};
+
+/** \internal
+ * \brief Nonbonded atom data - both inputs and outputs.
+ */
+struct NBAtomData
+{
+    //! number of atoms
+    int numAtoms;
+    //! number of local atoms
+    int numAtomsLocal;
+    //! allocation size for the atom data (xq, f)
+    int numAtomsAlloc;
+
+    //! atom coordinates + charges, size \ref numAtoms
+    DeviceBuffer<Float4> xq;
+    //! force output array, size \ref numAtoms
+    DeviceBuffer<Float3> f;
+
+    //! LJ energy output, size 1
+    DeviceBuffer<float> eLJ;
+    //! Electrostatics energy output, size 1
+    DeviceBuffer<float> eElec;
+
+    //! shift forces
+    DeviceBuffer<Float3> fShift;
+
+    //! number of atom types
+    int numTypes;
+    //! atom type indices, size \ref numAtoms
+    DeviceBuffer<int> atomTypes;
+    //! sqrt(c6),sqrt(c12), size \ref numAtoms
+    DeviceBuffer<Float2> ljComb;
+
+    //! shift vectors
+    DeviceBuffer<Float3> shiftVec;
+    //! 
true if the shift vector has been uploaded + bool shiftVecUploaded; +}; + /** \internal * \brief Parameters required for the GPU nonbonded calculations. */ diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp index 6cdad01019..e00874a30b 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl.cpp @@ -530,7 +530,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom /* local/nonlocal offset and length used for xq and f */ int adat_begin, adat_len; - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -562,12 +562,12 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom if (atomLocality == AtomLocality::Local) { adat_begin = 0; - adat_len = adat->natoms_local; + adat_len = adat->numAtomsLocal; } else { - adat_begin = adat->natoms_local; - adat_len = adat->natoms - adat->natoms_local; + adat_begin = adat->numAtomsLocal; + adat_len = adat->numAtoms - adat->numAtomsLocal; } /* beginning of timed HtoD section */ @@ -622,7 +622,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom */ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nbnxm::InteractionLocality iloc) { - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; @@ -717,11 +717,11 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb &nbparams_params, &adat->xq, &adat->f, - &adat->e_lj, - &adat->e_el, - &adat->fshift, - &adat->lj_comb, - &adat->shift_vec, + &adat->eLJ, + &adat->eElec, + &adat->fShift, + &adat->ljComb, + &adat->shiftVec, &nbp->nbfp, &nbp->nbfp_comb, &nbp->coulomb_tab, @@ -736,15 +736,15 @@ void gpu_launch_kernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const Nb { const auto kernelArgs = prepareGpuKernelArguments(kernel, config, - &adat->ntypes, + &adat->numTypes, &nbparams_params, &adat->xq, &adat->f, - &adat->e_lj, - &adat->e_el, - &adat->fshift, - &adat->atom_types, - &adat->shift_vec, + &adat->eLJ, + &adat->eElec, + &adat->fShift, + &adat->atomTypes, + &adat->shiftVec, &nbp->nbfp, &nbp->nbfp_comb, &nbp->coulomb_tab, @@ -793,7 +793,7 @@ static inline int calc_shmem_required_prune(const int num_threads_z) */ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, const int numParts) { - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; cl_timers_t* t = nb->timers; @@ -898,7 +898,7 @@ void gpu_launch_kernel_pruneonly(NbnxmGpu* nb, const InteractionLocality iloc, c config, &nbparams_params, &adat->xq, - &adat->shift_vec, + &adat->shiftVec, &plist->sci, &plist->cj4, &plist->imask, @@ -945,7 +945,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, "Non-local stream is indicating that the copy back event is enqueued at the " "beginning of the copy back function."); - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; cl_timers_t* t = nb->timers; bool bDoTime = nb->bDoTime; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; @@ -1013,10 +1013,10 @@ void gpu_launch_cpyback(NbnxmGpu* nb, if (stepWork.computeVirial) { static_assert( - sizeof(*nb->nbst.fshift) == sizeof(Float3), + sizeof(*nb->nbst.fShift) == sizeof(Float3), "Sizes of 
host- and device-side shift vector elements should be the same."); - copyFromDeviceBuffer(nb->nbst.fshift, - &adat->fshift, + copyFromDeviceBuffer(nb->nbst.fShift, + &adat->fShift, 0, SHIFTS, deviceStream, @@ -1027,20 +1027,20 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH energies */ if (stepWork.computeEnergy) { - static_assert(sizeof(*nb->nbst.e_lj) == sizeof(float), + static_assert(sizeof(*nb->nbst.eLJ) == sizeof(float), "Sizes of host- and device-side LJ energy terms should be the same."); - copyFromDeviceBuffer(nb->nbst.e_lj, - &adat->e_lj, + copyFromDeviceBuffer(nb->nbst.eLJ, + &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, bDoTime ? t->xf[aloc].nb_d2h.fetchNextEvent() : nullptr); - static_assert(sizeof(*nb->nbst.e_el) == sizeof(float), + static_assert(sizeof(*nb->nbst.eElec) == sizeof(float), "Sizes of host- and device-side electrostatic energy terms should be the " "same."); - copyFromDeviceBuffer(nb->nbst.e_el, - &adat->e_el, + copyFromDeviceBuffer(nb->nbst.eElec, + &adat->eElec, 0, 1, deviceStream, diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp index da998182ca..25cb3158b2 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_data_mgmt.cpp @@ -103,16 +103,16 @@ static unsigned int gpu_min_ci_balanced_factor = 50; /*! \brief Initializes the atomdata structure first time, it only gets filled at pair-search. */ -static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, const DeviceContext& deviceContext) +static void init_atomdata_first(NBAtomData* ad, int ntypes, const DeviceContext& deviceContext) { - ad->ntypes = ntypes; + ad->numTypes = ntypes; - allocateDeviceBuffer(&ad->shift_vec, SHIFTS * DIM, deviceContext); - ad->bShiftVecUploaded = CL_FALSE; + allocateDeviceBuffer(&ad->shiftVec, SHIFTS * DIM, deviceContext); + ad->shiftVecUploaded = false; - allocateDeviceBuffer(&ad->fshift, SHIFTS * DIM, deviceContext); - allocateDeviceBuffer(&ad->e_lj, 1, deviceContext); - allocateDeviceBuffer(&ad->e_el, 1, deviceContext); + allocateDeviceBuffer(&ad->fShift, SHIFTS * DIM, deviceContext); + allocateDeviceBuffer(&ad->eLJ, 1, deviceContext); + allocateDeviceBuffer(&ad->eElec, 1, deviceContext); /* initialize to nullptr pointers to data that is not allocated here and will need reallocation in nbnxn_gpu_init_atomdata */ @@ -120,8 +120,8 @@ static void init_atomdata_first(cl_atomdata_t* ad, int ntypes, const DeviceConte ad->f = nullptr; /* size -1 indicates that the respective array hasn't been initialized yet */ - ad->natoms = -1; - ad->nalloc = -1; + ad->numAtoms = -1; + ad->numAtomsAlloc = -1; } @@ -206,7 +206,7 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb) { cl_int cl_error; - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; cl_command_queue ls = nb->deviceStreams[InteractionLocality::Local]->stream(); size_t local_work_size[3] = { 1, 1, 1 }; @@ -223,9 +223,9 @@ static void nbnxn_ocl_clear_e_fshift(NbnxmGpu* nb) global_work_size[0] = ((shifts + local_work_size[0] - 1) / local_work_size[0]) * local_work_size[0]; arg_no = 0; - cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift)); - cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj)); - cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el)); + cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fShift)); + cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), 
&(adat->eLJ)); + cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->eElec)); cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts); GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str()); @@ -264,7 +264,7 @@ static void nbnxn_gpu_init_kernels(NbnxmGpu* nb) * Initializes members of the atomdata and nbparam structs and * clears e/fshift output buffers. */ -static void nbnxn_ocl_init_const(cl_atomdata_t* atomData, +static void nbnxn_ocl_init_const(NBAtomData* atomData, NBParamGpu* nbParams, const interaction_const_t* ic, const PairlistParams& listParams, @@ -304,9 +304,9 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, nb->dev_rundata = new gmx_device_runtime_data_t(); /* init nbst */ - pmalloc(reinterpret_cast(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj)); - pmalloc(reinterpret_cast(&nb->nbst.e_el), sizeof(*nb->nbst.e_el)); - pmalloc(reinterpret_cast(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift)); + pmalloc(reinterpret_cast(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ)); + pmalloc(reinterpret_cast(&nb->nbst.eElec), sizeof(*nb->nbst.eElec)); + pmalloc(reinterpret_cast(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift)); init_plist(nb->plist[InteractionLocality::Local]); @@ -372,7 +372,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear) return; } - cl_atomdata_t* atomData = nb->atdat; + NBAtomData* atomData = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; clearDeviceBufferAsync(&atomData->f, 0, natoms_clear, localStream); @@ -381,7 +381,7 @@ static void nbnxn_ocl_clear_f(NbnxmGpu* nb, int natoms_clear) //! This function is documented in the header file void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) { - nbnxn_ocl_clear_f(nb, nb->atdat->natoms); + nbnxn_ocl_clear_f(nb, nb->atdat->numAtoms); /* clear shift force array and energies if the outputs were used in the current step */ if (computeVirial) @@ -398,22 +398,22 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) //! 
This function is documented in the header file void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { - cl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; /* only if we have a dynamic box */ - if (nbatom->bDynamicBox || !adat->bShiftVecUploaded) + if (nbatom->bDynamicBox || !adat->shiftVecUploaded) { static_assert(sizeof(Float3) == sizeof(nbatom->shift_vec[0]), "Sizes of host- and device-side shift vectors should be the same."); - copyToDeviceBuffer(&adat->shift_vec, + copyToDeviceBuffer(&adat->shiftVec, reinterpret_cast(nbatom->shift_vec.data()), 0, SHIFTS, localStream, GpuApiCallBehavior::Async, nullptr); - adat->bShiftVecUploaded = CL_TRUE; + adat->shiftVecUploaded = true; } } @@ -425,7 +425,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) bool realloced; bool bDoTime = nb->bDoTime; cl_timers_t* timers = nb->timers; - cl_atomdata_t* d_atdat = nb->atdat; + NBAtomData* d_atdat = nb->atdat; const DeviceContext& deviceContext = *nb->deviceContext_; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; @@ -440,17 +440,17 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) /* need to reallocate if we have to copy more atoms than the amount of space available and only allocate if we haven't initialized yet, i.e d_atdat->natoms == -1 */ - if (natoms > d_atdat->nalloc) + if (natoms > d_atdat->numAtomsAlloc) { nalloc = over_alloc_small(natoms); /* free up first if the arrays have already been initialized */ - if (d_atdat->nalloc != -1) + if (d_atdat->numAtomsAlloc != -1) { freeDeviceBuffer(&d_atdat->f); freeDeviceBuffer(&d_atdat->xq); - freeDeviceBuffer(&d_atdat->lj_comb); - freeDeviceBuffer(&d_atdat->atom_types); + freeDeviceBuffer(&d_atdat->ljComb); + freeDeviceBuffer(&d_atdat->atomTypes); } @@ -460,19 +460,19 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) if (useLjCombRule(nb->nbparam->vdwType)) { // Two Lennard-Jones parameters per atom - allocateDeviceBuffer(&d_atdat->lj_comb, nalloc, deviceContext); + allocateDeviceBuffer(&d_atdat->ljComb, nalloc, deviceContext); } else { - allocateDeviceBuffer(&d_atdat->atom_types, nalloc, deviceContext); + allocateDeviceBuffer(&d_atdat->atomTypes, nalloc, deviceContext); } - d_atdat->nalloc = nalloc; - realloced = true; + d_atdat->numAtomsAlloc = nalloc; + realloced = true; } - d_atdat->natoms = natoms; - d_atdat->natoms_local = nbat->natoms_local; + d_atdat->numAtoms = natoms; + d_atdat->numAtomsLocal = nbat->natoms_local; /* need to clear GPU f output if realloc happened */ if (realloced) @@ -484,7 +484,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { static_assert(sizeof(float) == sizeof(*nbat->params().lj_comb.data()), "Size of the LJ parameters element should be equal to the size of float2."); - copyToDeviceBuffer(&d_atdat->lj_comb, + copyToDeviceBuffer(&d_atdat->ljComb, reinterpret_cast(nbat->params().lj_comb.data()), 0, natoms, @@ -496,7 +496,7 @@ void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { static_assert(sizeof(int) == sizeof(*nbat->params().type.data()), "Sizes of host- and device-side atom types should be the same."); - copyToDeviceBuffer(&d_atdat->atom_types, + copyToDeviceBuffer(&d_atdat->atomTypes, nbat->params().type.data(), 0, natoms, @@ -592,12 +592,12 @@ void gpu_free(NbnxmGpu* nb) /* Free atdat */ freeDeviceBuffer(&(nb->atdat->xq)); freeDeviceBuffer(&(nb->atdat->f)); - 
freeDeviceBuffer(&(nb->atdat->e_lj)); - freeDeviceBuffer(&(nb->atdat->e_el)); - freeDeviceBuffer(&(nb->atdat->fshift)); - freeDeviceBuffer(&(nb->atdat->lj_comb)); - freeDeviceBuffer(&(nb->atdat->atom_types)); - freeDeviceBuffer(&(nb->atdat->shift_vec)); + freeDeviceBuffer(&(nb->atdat->eLJ)); + freeDeviceBuffer(&(nb->atdat->eElec)); + freeDeviceBuffer(&(nb->atdat->fShift)); + freeDeviceBuffer(&(nb->atdat->ljComb)); + freeDeviceBuffer(&(nb->atdat->atomTypes)); + freeDeviceBuffer(&(nb->atdat->shiftVec)); sfree(nb->atdat); /* Free nbparam */ @@ -624,14 +624,14 @@ void gpu_free(NbnxmGpu* nb) } /* Free nbst */ - pfree(nb->nbst.e_lj); - nb->nbst.e_lj = nullptr; + pfree(nb->nbst.eLJ); + nb->nbst.eLJ = nullptr; - pfree(nb->nbst.e_el); - nb->nbst.e_el = nullptr; + pfree(nb->nbst.eElec); + nb->nbst.eElec = nullptr; - pfree(nb->nbst.fshift); - nb->nbst.fshift = nullptr; + pfree(nb->nbst.fShift); + nb->nbst.fShift = nullptr; freeGpuProgram(nb->dev_rundata->program); delete nb->dev_rundata; diff --git a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h index 751e352962..474d90700d 100644 --- a/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h +++ b/src/gromacs/nbnxm/opencl/nbnxm_ocl_types.h @@ -74,62 +74,6 @@ enum ePruneKind ePruneNR }; -/*! \internal - * \brief Staging area for temporary data downloaded from the GPU. - * - * The energies/shift forces get downloaded here first, before getting added - * to the CPU-side aggregate values. - */ -struct nb_staging_t -{ - //! LJ energy - float* e_lj = nullptr; - //! electrostatic energy - float* e_el = nullptr; - //! float3 buffer with shift forces - Float3* fshift = nullptr; -}; - -/*! \internal - * \brief Nonbonded atom data - both inputs and outputs. - */ -typedef struct cl_atomdata -{ - //! number of atoms - int natoms; - //! number of local atoms - int natoms_local; - //! allocation size for the atom data (xq, f) - int nalloc; - - //! float4 buffer with atom coordinates + charges, size natoms - DeviceBuffer xq; - - //! float3 buffer with force output array, size natoms - DeviceBuffer f; - - //! LJ energy output, size 1 - DeviceBuffer e_lj; - //! Electrostatics energy input, size 1 - DeviceBuffer e_el; - - //! float3 buffer with shift forces - DeviceBuffer fshift; - - //! number of atom types - int ntypes; - //! int buffer with atom type indices, size natoms - DeviceBuffer atom_types; - //! float2 buffer with sqrt(c6),sqrt(c12), size natoms - DeviceBuffer lj_comb; - - //! float3 buffer with shifts values - DeviceBuffer shift_vec; - - //! true if the shift vector has been uploaded - bool bShiftVecUploaded; -} cl_atomdata_t; - /*! \internal * \brief Data structure shared between the OpenCL device code and OpenCL host code * @@ -229,13 +173,13 @@ struct NbnxmGpu bool bNonLocalStreamDoneMarked = false; //! atom data - cl_atomdata_t* atdat = nullptr; + NBAtomData* atdat = nullptr; //! parameters required for the non-bonded calc. NBParamGpu* nbparam = nullptr; //! pair-list data structures (local and non-local) gmx::EnumerationArray plist = { nullptr }; //! staging area where fshift/energies get downloaded - nb_staging_t nbst; + NBStagingData nbst; //! 
local and non-local GPU queues gmx::EnumerationArray deviceStreams; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp index c313deb110..60ba8b32f4 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl.cpp @@ -87,7 +87,7 @@ void gpu_launch_cpyback(NbnxmGpu* nb, "beginning of the copy back function."); const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; /* don't launch non-local copy-back if there was no non-local work to do */ if ((iloc == InteractionLocality::NonLocal) && !haveGpuShortRangeWork(*nb, iloc)) @@ -138,24 +138,24 @@ void gpu_launch_cpyback(NbnxmGpu* nb, /* DtoH fshift when virial is needed */ if (stepWork.computeVirial) { - GMX_ASSERT(sizeof(*nb->nbst.fshift) == adat->fShift.elementSize(), + GMX_ASSERT(sizeof(*nb->nbst.fShift) == adat->fShift.elementSize(), "Sizes of host- and device-side shift vector elements should be the same."); copyFromDeviceBuffer( - nb->nbst.fshift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr); + nb->nbst.fShift, &adat->fShift, 0, SHIFTS, deviceStream, GpuApiCallBehavior::Async, nullptr); } /* DtoH energies */ if (stepWork.computeEnergy) { - GMX_ASSERT(sizeof(*nb->nbst.e_lj) == sizeof(float), + GMX_ASSERT(sizeof(*nb->nbst.eLJ) == sizeof(float), "Sizes of host- and device-side LJ energy terms should be the same."); copyFromDeviceBuffer( - nb->nbst.e_lj, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); - GMX_ASSERT(sizeof(*nb->nbst.e_el) == sizeof(float), + nb->nbst.eLJ, &adat->eLJ, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); + GMX_ASSERT(sizeof(*nb->nbst.eElec) == sizeof(float), "Sizes of host- and device-side electrostatic energy terms should be the " "same."); copyFromDeviceBuffer( - nb->nbst.e_el, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); + nb->nbst.eElec, &adat->eElec, 0, 1, deviceStream, GpuApiCallBehavior::Async, nullptr); } } } @@ -168,7 +168,7 @@ void gpu_copy_xq_to_gpu(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom, const Atom const InteractionLocality iloc = gpuAtomToInteractionLocality(atomLocality); - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; gpu_plist* plist = nb->plist[iloc]; const DeviceStream& deviceStream = *nb->deviceStreams[iloc]; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp index e033a507e8..0f67f04f0a 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_data_mgmt.cpp @@ -59,10 +59,10 @@ namespace Nbnxm //! 
This function is documented in the header file void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) { - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; // Clear forces - clearDeviceBufferAsync(&adat->f, 0, nb->atdat->natoms, localStream); + clearDeviceBufferAsync(&adat->f, 0, nb->atdat->numAtoms, localStream); // Clear shift force array and energies if the outputs were used in the current step if (computeVirial) { @@ -76,7 +76,7 @@ void gpu_clear_outputs(NbnxmGpu* nb, bool computeVirial) static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& deviceContext) { const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; - sycl_atomdata_t* atomdata = nb->atdat; + NBAtomData* atomdata = nb->atdat; atomdata->numTypes = numTypes; allocateDeviceBuffer(&atomdata->shiftVec, SHIFTS, deviceContext); atomdata->shiftVecUploaded = false; @@ -95,8 +95,8 @@ static void initAtomdataFirst(NbnxmGpu* nb, int numTypes, const DeviceContext& d atomdata->f = nullptr; /* size -1 indicates that the respective array hasn't been initialized yet */ - atomdata->natoms = -1; - atomdata->numAlloc = -1; + atomdata->numAtoms = -1; + atomdata->numAtomsAlloc = -1; } /*! \brief Initialize the nonbonded parameter data structure. */ @@ -144,7 +144,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, { auto* nb = new NbnxmGpu(); nb->deviceContext_ = &deviceStreamManager.context(); - nb->atdat = new sycl_atomdata_t; + nb->atdat = new NBAtomData; nb->nbparam = new NBParamGpu; nb->plist[InteractionLocality::Local] = new Nbnxm::gpu_plist; if (bLocalAndNonlocal) @@ -158,9 +158,9 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, nb->timings = nullptr; /* init nbst */ - pmalloc(reinterpret_cast(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj)); - pmalloc(reinterpret_cast(&nb->nbst.e_el), sizeof(*nb->nbst.e_el)); - pmalloc(reinterpret_cast(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift)); + pmalloc(reinterpret_cast(&nb->nbst.eLJ), sizeof(*nb->nbst.eLJ)); + pmalloc(reinterpret_cast(&nb->nbst.eElec), sizeof(*nb->nbst.eElec)); + pmalloc(reinterpret_cast(&nb->nbst.fShift), SHIFTS * sizeof(*nb->nbst.fShift)); init_plist(nb->plist[InteractionLocality::Local]); @@ -195,7 +195,7 @@ NbnxmGpu* gpu_init(const gmx::DeviceStreamManager& deviceStreamManager, void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) { - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; /* only if we have a dynamic box */ @@ -217,18 +217,18 @@ void gpu_upload_shiftvec(NbnxmGpu* nb, const nbnxn_atomdata_t* nbatom) void gpu_init_atomdata(NbnxmGpu* nb, const nbnxn_atomdata_t* nbat) { GMX_ASSERT(!nb->bDoTime, "Timing on SYCL not supported yet"); - sycl_atomdata_t* atdat = nb->atdat; + NBAtomData* atdat = nb->atdat; const DeviceContext& deviceContext = *nb->deviceContext_; const DeviceStream& localStream = *nb->deviceStreams[InteractionLocality::Local]; int numAtoms = nbat->numAtoms(); bool reallocated = false; - if (numAtoms > atdat->numAlloc) + if (numAtoms > atdat->numAtomsAlloc) { int numAlloc = over_alloc_small(numAtoms); /* free up first if the arrays have already been initialized */ - if (atdat->numAlloc != -1) + if (atdat->numAtomsAlloc != -1) { freeDeviceBuffer(&atdat->f); freeDeviceBuffer(&atdat->xq); @@ -247,17 +247,17 @@ void gpu_init_atomdata(NbnxmGpu* nb, 
const nbnxn_atomdata_t* nbat) allocateDeviceBuffer(&atdat->atomTypes, numAlloc, deviceContext); } - atdat->numAlloc = numAlloc; - reallocated = true; + atdat->numAtomsAlloc = numAlloc; + reallocated = true; } - atdat->natoms = numAtoms; - atdat->natoms_local = nbat->natoms_local; + atdat->numAtoms = numAtoms; + atdat->numAtomsLocal = nbat->natoms_local; /* need to clear GPU f output if realloc happened */ if (reallocated) { - clearDeviceBufferAsync(&atdat->f, 0, atdat->numAlloc, localStream); + clearDeviceBufferAsync(&atdat->f, 0, atdat->numAtomsAlloc, localStream); } if (useLjCombRule(nb->nbparam->vdwType)) @@ -293,8 +293,8 @@ void gpu_free(NbnxmGpu* nb) return; } - sycl_atomdata_t* atdat = nb->atdat; - NBParamGpu* nbparam = nb->nbparam; + NBAtomData* atdat = nb->atdat; + NBParamGpu* nbparam = nb->nbparam; if ((!nbparam->coulomb_tab) && (nbparam->elecType == ElecType::EwaldTab || nbparam->elecType == ElecType::EwaldTabTwin)) @@ -322,14 +322,14 @@ void gpu_free(NbnxmGpu* nb) } /* Free nbst */ - pfree(nb->nbst.e_lj); - nb->nbst.e_lj = nullptr; + pfree(nb->nbst.eLJ); + nb->nbst.eLJ = nullptr; - pfree(nb->nbst.e_el); - nb->nbst.e_el = nullptr; + pfree(nb->nbst.eElec); + nb->nbst.eElec = nullptr; - pfree(nb->nbst.fshift); - nb->nbst.fshift = nullptr; + pfree(nb->nbst.fShift); + nb->nbst.fShift = nullptr; delete atdat; delete nbparam; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp index 946eb2dd4d..770732c70c 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel.cpp @@ -962,7 +962,7 @@ cl::sycl::event chooseAndLaunchNbnxmKernel(bool doPruneNBL, void launchNbnxmKernel(NbnxmGpu* nb, const gmx::StepWorkload& stepWork, const InteractionLocality iloc) { - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; const bool doPruneNBL = (plist->haveFreshList && !nb->didPrune[iloc]); diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp index cf9ce2f67d..62f4bb7592 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_kernel_pruneonly.cpp @@ -264,7 +264,7 @@ void launchNbnxmKernelPruneOnly(NbnxmGpu* nb, const int part, const int numSciInPart) { - sycl_atomdata_t* adat = nb->atdat; + NBAtomData* adat = nb->atdat; NBParamGpu* nbp = nb->nbparam; gpu_plist* plist = nb->plist[iloc]; const bool haveFreshList = plist->haveFreshList; diff --git a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h index ba14d9b867..c1e23c1a74 100644 --- a/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h +++ b/src/gromacs/nbnxm/sycl/nbnxm_sycl_types.h @@ -55,60 +55,6 @@ #include "gromacs/timing/gpu_timing.h" #include "gromacs/utility/enumerationhelpers.h" -/*! \internal - * \brief Staging area for temporary data downloaded from the GPU. - * - * Since SYCL buffers already have host-side storage, this is a bit redundant. - * But it allows prefetching of the data from GPU, and brings GPU backends closer together. - */ -struct nb_staging_t -{ - //! LJ energy - float* e_lj = nullptr; - //! electrostatic energy - float* e_el = nullptr; - //! shift forces - Float3* fshift = nullptr; -}; - -/** \internal - * \brief Nonbonded atom data - both inputs and outputs. - */ -struct sycl_atomdata_t -{ - //! number of atoms - int natoms; - //! number of local atoms - int natoms_local; // - //! 
allocation size for the atom data (xq, f) - int numAlloc; - - //! atom coordinates + charges, size \ref natoms - DeviceBuffer xq; - //! force output array, size \ref natoms - DeviceBuffer f; - - //! LJ energy output, size 1 - DeviceBuffer eLJ; - //! Electrostatics energy input, size 1 - DeviceBuffer eElec; - - //! shift forces - DeviceBuffer fShift; - - //! number of atom types - int numTypes; - //! atom type indices, size \ref natoms - DeviceBuffer atomTypes; - //! sqrt(c6),sqrt(c12) size \ref natoms - DeviceBuffer ljComb; - - //! shifts - DeviceBuffer shiftVec; - //! true if the shift vector has been uploaded - bool shiftVecUploaded; -}; - class GpuEventSynchronizer; /*! \internal @@ -126,13 +72,13 @@ struct NbnxmGpu /*! \brief true indicates that the nonlocal_done event was marked */ bool bNonLocalStreamDoneMarked = false; /*! \brief atom data */ - sycl_atomdata_t* atdat = nullptr; + NBAtomData* atdat = nullptr; NBParamGpu* nbparam = nullptr; /*! \brief pair-list data structures (local and non-local) */ gmx::EnumerationArray plist = { { nullptr } }; /*! \brief staging area where fshift/energies get downloaded. Will be removed in SYCL. */ - nb_staging_t nbst; + NBStagingData nbst; /*! \brief local and non-local GPU streams */ gmx::EnumerationArray deviceStreams; -- 2.22.0
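
Note (illustration, not part of the patch): all three backends now share the
same atom-range convention through NBAtomData, which is what lets the common
code in gpu_common.h (getGpuAtomRange(), gpu_reduce_staged_outputs()) work
unchanged for CUDA, OpenCL and SYCL. The sketch below restates that convention
in plain, self-contained C++; AtomDataCounts and getAtomRange are hypothetical
stand-ins for exposition (the real struct also carries the DeviceBuffer
members, which are omitted here).

    // Local atoms occupy indices [0, numAtomsLocal); non-local atoms occupy
    // [numAtomsLocal, numAtoms), mirroring the getGpuAtomRange() hunk above.
    struct AtomDataCounts // hypothetical stand-in for the counters in NBAtomData
    {
        int numAtoms      = -1; // -1 marks "not initialized yet", as in the patch
        int numAtomsLocal = -1;
    };

    enum class AtomLocality { Local, NonLocal };

    static void getAtomRange(const AtomDataCounts& atomData,
                             AtomLocality          atomLocality,
                             int*                  atomRangeBegin,
                             int*                  atomRangeLen)
    {
        if (atomLocality == AtomLocality::Local)
        {
            *atomRangeBegin = 0;
            *atomRangeLen   = atomData.numAtomsLocal;
        }
        else
        {
            *atomRangeBegin = atomData.numAtomsLocal;
            *atomRangeLen   = atomData.numAtoms - atomData.numAtomsLocal;
        }
    }

The same arithmetic produces adat_begin/adat_len in gpu_copy_xq_to_gpu() in all
three backends, so the per-backend cu_atomdata_t/cl_atomdata_t/sycl_atomdata_t
structs carried no backend-specific state and could be folded into one type.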